Subject: [Qemu-devel] [PATCH v3 27/34] target-arm: emulate LL/SC using cmpxchg helpers
From: Richard Henderson
Date: Sat, 3 Sep 2016 13:39:55 -0700
From: "Emilio G. Cota" <address@hidden>
Emulating LL/SC with cmpxchg is not correct, since it can
suffer from the ABA problem. Portable parallel code, however,
is written assuming only cmpxchg--and not LL/SC--is available.
This means that in practice emulating LL/SC with cmpxchg is
a viable alternative.
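As a minimal C11 sketch of the ABA hazard (purely illustrative, not
code from this patch): a real LDREX/STREX pair fails if another thread
wrote to the monitored location in between, but a cmpxchg-based
emulation succeeds as long as the value has returned to what LDREX saw.

    #include <stdatomic.h>
    #include <stdbool.h>

    /* Emulated STREX: succeed iff *p still holds the LDREX'd value.
     * If another thread changed *p from `loaded` to something else
     * and back to `loaded` in between, this succeeds even though a
     * hardware exclusive monitor would have failed: the ABA case. */
    static bool emulated_strex(atomic_int *p, int loaded, int newval)
    {
        int expected = loaded;
        return atomic_compare_exchange_strong(p, &expected, newval);
    }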
The appended patch emulates LL/SC pairs in ARM with cmpxchg helpers.
This works in both user and system mode. In user mode, it avoids
pausing all other CPUs to perform the LL/SC pair. The resulting
performance and scalability improvement is significant, as the
plots below show. They plot the throughput of atomic_add-bench
compiled for ARM and executed on a 64-core x86 machine.
Hi-res plots: http://imgur.com/a/aNQpB
[Four ASCII plots elided; see the hi-res versions at the link above.
Each plots atomic_add-bench throughput against the number of threads
(0 to 60) for this patch ("cmpxchg") and for "master":
  atomic_add-bench: 1000000 ops/thread, [0,1] range
  atomic_add-bench: 1000000 ops/thread, [0,2] range
  atomic_add-bench: 1000000 ops/thread, [0,128] range
  atomic_add-bench: 1000000 ops/thread, [0,1024] range]
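The shape of the benchmark loop, as a hedged sketch (the actual
tests/atomic_add-bench.c added earlier in this series differs in
details such as random-number generation and counter padding): each
thread performs its 1000000 atomic increments on counters picked from
[0, range), so a smaller range means more contention.

    #include <stdatomic.h>

    /* One benchmark thread: n_ops atomic increments spread over
     * `range` counters.  range == 1 is maximal contention. */
    static void bench_thread(atomic_long *counters, unsigned range,
                             unsigned long n_ops, unsigned seed)
    {
        unsigned x = seed ? seed : 1;            /* xorshift state */
        for (unsigned long i = 0; i < n_ops; i++) {
            x ^= x << 13; x ^= x >> 17; x ^= x << 5;
            atomic_fetch_add(&counters[x % range], 1);
        }
    }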
[rth: Enforce alignment for ldrexd.]
Signed-off-by: Emilio G. Cota <address@hidden>
Message-Id: <address@hidden>
Signed-off-by: Richard Henderson <address@hidden>
---
target-arm/translate.c | 136 +++++++++++++++----------------------------------
1 file changed, 42 insertions(+), 94 deletions(-)
diff --git a/target-arm/translate.c b/target-arm/translate.c
index 1b5bf87..680635c 100644
--- a/target-arm/translate.c
+++ b/target-arm/translate.c
@@ -7676,47 +7676,27 @@ static void gen_logicq_cc(TCGv_i32 lo, TCGv_i32 hi)
tcg_gen_or_i32(cpu_ZF, lo, hi);
}
-/* Load/Store exclusive instructions are implemented by remembering
- the value/address loaded, and seeing if these are the same
- when the store is performed. This should be sufficient to implement
- the architecturally mandated semantics, and avoids having to monitor
- regular stores.
-
- In system emulation mode only one CPU will be running at once, so
- this sequence is effectively atomic. In user emulation mode we
- throw an exception and handle the atomic operation elsewhere. */
static void gen_load_exclusive(DisasContext *s, int rt, int rt2,
TCGv_i32 addr, int size)
{
TCGv_i32 tmp = tcg_temp_new_i32();
+ TCGMemOp opc = size | MO_ALIGN | s->be_data;
s->is_ldex = true;
- switch (size) {
- case 0:
- gen_aa32_ld8u(s, tmp, addr, get_mem_index(s));
- break;
- case 1:
- gen_aa32_ld16ua(s, tmp, addr, get_mem_index(s));
- break;
- case 2:
- case 3:
- gen_aa32_ld32ua(s, tmp, addr, get_mem_index(s));
- break;
- default:
- abort();
- }
-
if (size == 3) {
TCGv_i32 tmp2 = tcg_temp_new_i32();
- TCGv_i32 tmp3 = tcg_temp_new_i32();
+ TCGv_i64 t64 = tcg_temp_new_i64();
+
+ gen_aa32_ld_i64(s, t64, addr, get_mem_index(s), opc);
+ tcg_gen_mov_i64(cpu_exclusive_val, t64);
+ tcg_gen_extr_i64_i32(tmp, tmp2, t64);
+ tcg_temp_free_i64(t64);
- tcg_gen_addi_i32(tmp2, addr, 4);
- gen_aa32_ld32u(s, tmp3, tmp2, get_mem_index(s));
+ store_reg(s, rt2, tmp2);
tcg_temp_free_i32(tmp2);
- tcg_gen_concat_i32_i64(cpu_exclusive_val, tmp, tmp3);
- store_reg(s, rt2, tmp3);
} else {
+ gen_aa32_ld_i32(s, tmp, addr, get_mem_index(s), opc);
tcg_gen_extu_i32_i64(cpu_exclusive_val, tmp);
}
@@ -7729,23 +7709,15 @@ static void gen_clrex(DisasContext *s)
tcg_gen_movi_i64(cpu_exclusive_addr, -1);
}
-#ifdef CONFIG_USER_ONLY
-static void gen_store_exclusive(DisasContext *s, int rd, int rt, int rt2,
- TCGv_i32 addr, int size)
-{
- tcg_gen_extu_i32_i64(cpu_exclusive_test, addr);
- tcg_gen_movi_i32(cpu_exclusive_info,
- size | (rd << 4) | (rt << 8) | (rt2 << 12));
- gen_exception_internal_insn(s, 4, EXCP_STREX);
-}
-#else
static void gen_store_exclusive(DisasContext *s, int rd, int rt, int rt2,
TCGv_i32 addr, int size)
{
- TCGv_i32 tmp;
- TCGv_i64 val64, extaddr;
+ TCGv_i32 t0, t1, t2;
+ TCGv_i64 extaddr;
+ TCGv taddr;
TCGLabel *done_label;
TCGLabel *fail_label;
+ TCGMemOp opc = size | MO_ALIGN | s->be_data;
/* if (env->exclusive_addr == addr && env->exclusive_val == [addr]) {
[addr] = {Rt};
@@ -7760,69 +7732,45 @@ static void gen_store_exclusive(DisasContext *s, int rd, int rt, int rt2,
tcg_gen_brcond_i64(TCG_COND_NE, extaddr, cpu_exclusive_addr, fail_label);
tcg_temp_free_i64(extaddr);
- tmp = tcg_temp_new_i32();
- switch (size) {
- case 0:
- gen_aa32_ld8u(s, tmp, addr, get_mem_index(s));
- break;
- case 1:
- gen_aa32_ld16u(s, tmp, addr, get_mem_index(s));
- break;
- case 2:
- case 3:
- gen_aa32_ld32u(s, tmp, addr, get_mem_index(s));
- break;
- default:
- abort();
- }
-
- val64 = tcg_temp_new_i64();
+ taddr = gen_aa32_addr(s, addr, opc);
+ t0 = tcg_temp_new_i32();
+ t1 = load_reg(s, rt);
if (size == 3) {
- TCGv_i32 tmp2 = tcg_temp_new_i32();
- TCGv_i32 tmp3 = tcg_temp_new_i32();
- tcg_gen_addi_i32(tmp2, addr, 4);
- gen_aa32_ld32u(s, tmp3, tmp2, get_mem_index(s));
- tcg_temp_free_i32(tmp2);
- tcg_gen_concat_i32_i64(val64, tmp, tmp3);
- tcg_temp_free_i32(tmp3);
- } else {
- tcg_gen_extu_i32_i64(val64, tmp);
- }
- tcg_temp_free_i32(tmp);
+ TCGv_i64 o64 = tcg_temp_new_i64();
+ TCGv_i64 n64 = tcg_temp_new_i64();
- tcg_gen_brcond_i64(TCG_COND_NE, val64, cpu_exclusive_val, fail_label);
- tcg_temp_free_i64(val64);
+ t2 = load_reg(s, rt2);
+ tcg_gen_concat_i32_i64(n64, t1, t2);
+ tcg_temp_free_i32(t2);
+ gen_aa32_frob64(s, n64);
- tmp = load_reg(s, rt);
- switch (size) {
- case 0:
- gen_aa32_st8(s, tmp, addr, get_mem_index(s));
- break;
- case 1:
- gen_aa32_st16(s, tmp, addr, get_mem_index(s));
- break;
- case 2:
- case 3:
- gen_aa32_st32(s, tmp, addr, get_mem_index(s));
- break;
- default:
- abort();
- }
- tcg_temp_free_i32(tmp);
- if (size == 3) {
- tcg_gen_addi_i32(addr, addr, 4);
- tmp = load_reg(s, rt2);
- gen_aa32_st32(s, tmp, addr, get_mem_index(s));
- tcg_temp_free_i32(tmp);
+ tcg_gen_atomic_cmpxchg_i64(o64, taddr, cpu_exclusive_val, n64,
+ get_mem_index(s), opc);
+ tcg_temp_free_i64(n64);
+
+ gen_aa32_frob64(s, o64);
+ tcg_gen_setcond_i64(TCG_COND_NE, o64, o64, cpu_exclusive_val);
+ tcg_gen_extrl_i64_i32(t0, o64);
+
+ tcg_temp_free_i64(o64);
+ } else {
+ t2 = tcg_temp_new_i32();
+ tcg_gen_extrl_i64_i32(t2, cpu_exclusive_val);
+ tcg_gen_atomic_cmpxchg_i32(t0, taddr, t2, t1, get_mem_index(s), opc);
+ tcg_gen_setcond_i32(TCG_COND_NE, t0, t0, t2);
+ tcg_temp_free_i32(t2);
}
- tcg_gen_movi_i32(cpu_R[rd], 0);
+ tcg_temp_free_i32(t1);
+ tcg_temp_free(taddr);
+ tcg_gen_mov_i32(cpu_R[rd], t0);
+ tcg_temp_free_i32(t0);
tcg_gen_br(done_label);
+
gen_set_label(fail_label);
tcg_gen_movi_i32(cpu_R[rd], 1);
gen_set_label(done_label);
tcg_gen_movi_i64(cpu_exclusive_addr, -1);
}
-#endif
/* gen_srs:
* @env: CPUARMState
--
2.7.4
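For reference, the control flow the new gen_store_exclusive() emits
corresponds roughly to the C below. This is an illustrative rendering
only (ExclEnv and store_exclusive_32 are made-up names, not QEMU API);
the real code operates on TCG values and handles the 64-bit
ldrexd/strexd case with tcg_gen_atomic_cmpxchg_i64.

    #include <stdatomic.h>
    #include <stdint.h>

    /* Illustrative stand-ins for the TCG globals cpu_exclusive_addr
     * and cpu_exclusive_val recorded by gen_load_exclusive(). */
    typedef struct {
        uint64_t exclusive_addr;
        uint64_t exclusive_val;
    } ExclEnv;

    /* Returns 0 in Rd on success, 1 on failure, as STREX requires. */
    static uint32_t store_exclusive_32(ExclEnv *env,
                                       _Atomic uint32_t *addr,
                                       uint32_t newval)
    {
        uint32_t result = 1;                      /* assume failure */
        if ((uint64_t)(uintptr_t)addr == env->exclusive_addr) {
            uint32_t expected = (uint32_t)env->exclusive_val;
            /* One cmpxchg both re-checks the monitored value and,
             * on a match, performs the store atomically. */
            if (atomic_compare_exchange_strong(addr, &expected,
                                               newval)) {
                result = 0;
            }
        }
        env->exclusive_addr = -1;                 /* clear monitor */
        return result;
    }

The setcond in the patch computes the same 0/1 result by comparing the
old value returned by the cmpxchg against cpu_exclusive_val: equal
means the store happened (0), unequal means failure (1).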