qemu-devel
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

Re: [Qemu-devel] [PATCH v3 25/34] tests: add atomic_add-bench


From: Alex Bennée
Subject: Re: [Qemu-devel] [PATCH v3 25/34] tests: add atomic_add-bench
Date: Wed, 14 Sep 2016 14:53:14 +0100
User-agent: mu4e 0.9.17; emacs 25.1.12

Richard Henderson <address@hidden> writes:

> From: "Emilio G. Cota" <address@hidden>
>
> With this microbenchmark we can measure the overhead of emulating atomic
> instructions with a configurable degree of contention.
>
> The benchmark spawns $n threads, each performing $o atomic ops (additions)
> in a loop. Each atomic operation is performed on a different cache line
> (assuming lines are 64b long) that is randomly selected from a range [0, $r).
>
> [ Note: each $foo corresponds to a -foo flag ]
>
> Signed-off-by: Emilio G. Cota <address@hidden>
> Signed-off-by: Richard Henderson <address@hidden>
> Message-Id: <address@hidden>
> ---
>  tests/.gitignore         |   1 +
>  tests/Makefile.include   |   4 +-
>  tests/atomic_add-bench.c | 180 
> +++++++++++++++++++++++++++++++++++++++++++++++
>  3 files changed, 184 insertions(+), 1 deletion(-)
>  create mode 100644 tests/atomic_add-bench.c
>
> diff --git a/tests/.gitignore b/tests/.gitignore
> index dbb5263..ec3137a 100644
> --- a/tests/.gitignore
> +++ b/tests/.gitignore
> @@ -1,3 +1,4 @@
> +atomic_add-bench
>  check-qdict
>  check-qfloat
>  check-qint
> diff --git a/tests/Makefile.include b/tests/Makefile.include
> index 14be491..e1957ed 100644
> --- a/tests/Makefile.include
> +++ b/tests/Makefile.include
> @@ -421,7 +421,8 @@ test-obj-y = tests/check-qint.o tests/check-qstring.o 
> tests/check-qdict.o \
>       tests/test-opts-visitor.o tests/test-qmp-event.o \
>       tests/rcutorture.o tests/test-rcu-list.o \
>       tests/test-qdist.o \
> -     tests/test-qht.o tests/qht-bench.o tests/test-qht-par.o
> +     tests/test-qht.o tests/qht-bench.o tests/test-qht-par.o \
> +     tests/atomic_add-bench.o
>
>  $(test-obj-y): QEMU_INCLUDES += -Itests
>  QEMU_CFLAGS += -I$(SRC_PATH)/tests
> @@ -465,6 +466,7 @@ tests/test-qdist$(EXESUF): tests/test-qdist.o 
> $(test-util-obj-y)
>  tests/test-qht$(EXESUF): tests/test-qht.o $(test-util-obj-y)
>  tests/test-qht-par$(EXESUF): tests/test-qht-par.o tests/qht-bench$(EXESUF) 
> $(test-util-obj-y)
>  tests/qht-bench$(EXESUF): tests/qht-bench.o $(test-util-obj-y)
> +tests/atomic_add-bench$(EXESUF): tests/atomic_add-bench.o
>  $(test-util-obj-y)

This probably belongs in tests/tcg/generic or some such, but that needs
tcg/tests to be rehabilitated into the build system first, so at least
here it gets built.

>
>  tests/test-qdev-global-props$(EXESUF): tests/test-qdev-global-props.o \
>       hw/core/qdev.o hw/core/qdev-properties.o hw/core/hotplug.o\
> diff --git a/tests/atomic_add-bench.c b/tests/atomic_add-bench.c
> new file mode 100644
> index 0000000..5bbecf6
> --- /dev/null
> +++ b/tests/atomic_add-bench.c

I wonder if it would be worth renaming this to atomic-bench and adding the
other atomic operations to the benchmark? I know that, given the current
helper overhead, it's unlikely to show much difference between the ops, but
if we move to backend support for the tcg atomics it would be a useful tool
to have.

> @@ -0,0 +1,180 @@
> +#include "qemu/osdep.h"
> +#include "qemu/thread.h"
> +#include "qemu/host-utils.h"
> +#include "qemu/processor.h"
> +
> +struct thread_info {
> +    uint64_t r;
> +} QEMU_ALIGNED(64);
> +
> +struct count {
> +    unsigned long val;
> +} QEMU_ALIGNED(64);
> +
> +static QemuThread *threads;
> +static struct thread_info *th_info;
> +static unsigned int n_threads = 1;
> +static unsigned int n_ready_threads;
> +static struct count *counts;
> +static unsigned long n_ops = 10000;
> +static double duration;
> +static unsigned int range = 1;
> +static bool test_start;
> +
> +static const char commands_string[] =
> +    " -n = number of threads\n"
> +    " -o = number of ops per thread\n"
> +    " -r = range (will be rounded up to pow2)";
> +
> +static void usage_complete(char *argv[])
> +{
> +    fprintf(stderr, "Usage: %s [options]\n", argv[0]);
> +    fprintf(stderr, "options:\n%s\n", commands_string);
> +}
> +
> +/*
> + * From: https://en.wikipedia.org/wiki/Xorshift
> + * This is faster than rand_r(), and gives us a wider range (RAND_MAX is only
> + * guaranteed to be >= INT_MAX).
> + */
> +static uint64_t xorshift64star(uint64_t x)
> +{
> +    x ^= x >> 12; /* a */
> +    x ^= x << 25; /* b */
> +    x ^= x >> 27; /* c */
> +    return x * UINT64_C(2685821657736338717);
> +}
> +
> +static void *thread_func(void *arg)
> +{
> +    struct thread_info *info = arg;
> +    unsigned long i;
> +
> +    atomic_inc(&n_ready_threads);
> +    while (!atomic_mb_read(&test_start)) {
> +        cpu_relax();
> +    }
> +
> +    for (i = 0; i < n_ops; i++) {
> +        unsigned int index;
> +
> +        info->r = xorshift64star(info->r);
> +        index = info->r & (range - 1);
> +        atomic_inc(&counts[index].val);
> +    }
> +    return NULL;
> +}
> +
> +static inline
> +uint64_t ts_subtract(const struct timespec *a, const struct timespec *b)
> +{
> +    uint64_t ns;
> +
> +    ns = (b->tv_sec - a->tv_sec) * 1000000000ULL;
> +    ns += (b->tv_nsec - a->tv_nsec);
> +    return ns;
> +}
> +
> +static void run_test(void)
> +{
> +    unsigned int i;
> +    struct timespec ts_start, ts_end;
> +
> +    while (atomic_read(&n_ready_threads) != n_threads) {
> +        cpu_relax();
> +    }
> +    atomic_mb_set(&test_start, true);
> +
> +    clock_gettime(CLOCK_MONOTONIC, &ts_start);
> +    for (i = 0; i < n_threads; i++) {
> +        qemu_thread_join(&threads[i]);
> +    }
> +    clock_gettime(CLOCK_MONOTONIC, &ts_end);
> +    duration = ts_subtract(&ts_start, &ts_end) / 1e9;
> +}
> +
> +static void create_threads(void)
> +{
> +    unsigned int i;
> +
> +    threads = g_new(QemuThread, n_threads);
> +    th_info = g_new(struct thread_info, n_threads);
> +    counts = qemu_memalign(64, sizeof(*counts) * range);

This fails on my setup as, AFAICT, qemu_memalign doesn't give you zeroed
memory. I added a memset afterwards to zero it out.

> +
> +    for (i = 0; i < n_threads; i++) {
> +        struct thread_info *info = &th_info[i];
> +
> +        info->r = (i + 1) ^ time(NULL);
> +        qemu_thread_create(&threads[i], NULL, thread_func, info,
> +                           QEMU_THREAD_JOINABLE);
> +    }
> +}
> +
> +static void pr_params(void)
> +{
> +    printf("Parameters:\n");
> +    printf(" # of threads:      %u\n", n_threads);
> +    printf(" n_ops:             %lu\n", n_ops);
> +    printf(" ops' range:        %u\n", range);
> +}
> +
> +static void pr_stats(void)
> +{
> +    unsigned long long val = 0;
> +    unsigned int i;
> +    double tx;
> +
> +    for (i = 0; i < range; i++) {
> +        val += counts[i].val;
> +    }
> +    assert(val == n_threads * n_ops);

Again, while I was testing, this failed due to the above. It would probably
also be worth reporting the fail condition for the test, so my current
hacky patch looks like:

modified   tests/atomic_add-bench.c
@@ -100,6 +100,7 @@ static void create_threads(void)
     threads = g_new(QemuThread, n_threads);
     th_info = g_new(struct thread_info, n_threads);
     counts = qemu_memalign(64, sizeof(*counts) * range);
+    memset(counts, 0, sizeof(*counts) * range);

     for (i = 0; i < n_threads; i++) {
         struct thread_info *info = &th_info[i];
@@ -118,22 +119,29 @@ static void pr_params(void)
     printf(" ops' range:        %u\n", range);
 }

-static void pr_stats(void)
+static int pr_stats(void)
 {
-    unsigned long long val = 0;
+    unsigned long long target_val, val = 0;
     unsigned int i;
     double tx;

     for (i = 0; i < range; i++) {
         val += counts[i].val;
     }
-    assert(val == n_threads * n_ops);
+
+    target_val = (n_threads * n_ops);
+    if (val != target_val) {
+        printf("Bad total: %llu vs %llu\n", val, target_val);
+        return -1;
+    };
     tx = val / duration / 1e6;

     printf("Results:\n");
     printf("Duration:            %.2f s\n", duration);
     printf(" Throughput:         %.2f Mops/s\n", tx);
     printf(" Throughput/thread:  %.2f Mops/s/thread\n", tx / n_threads);
+
+    return 0;
 }

 static void parse_args(int argc, char *argv[])
@@ -175,6 +183,5 @@ int main(int argc, char *argv[])
     pr_params();
     create_threads();
     run_test();
-    pr_stats();
-    return 0;
+    return pr_stats();
 }

--
Alex Bennée



reply via email to

[Prev in Thread] Current Thread [Next in Thread]