qemu-devel
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

[Qemu-devel] [PULL 08/15] qdist: add module to represent frequency distr


From: Richard Henderson
Subject: [Qemu-devel] [PULL 08/15] qdist: add module to represent frequency distributions of data
Date: Fri, 10 Jun 2016 07:26:46 -0700

From: "Emilio G. Cota" <address@hidden>

Sometimes it is useful to have a quick histogram to represent a certain
distribution -- for example, when investigating a performance regression
in a hash table due to inadequate hashing.

The appended allows us to easily represent a distribution using Unicode
characters. Further, the data structure keeping track of the distribution
is so simple that obtaining its values for off-line processing is trivial.

Example, taking the last 10 commits to QEMU:

 Characters in commit title  Count
-----------------------------------
                         39      1
                         48      1
                         53      1
                         54      2
                         57      1
                         61      1
                         67      1
                         78      1
                         80      1
qdist_init(&dist);
qdist_inc(&dist, 39);
[...]
qdist_inc(&dist, 80);

char *str = qdist_pr(&dist, 9, QDIST_PR_LABELS);
// -> [39.0,43.6)▂▂ █▂ ▂ ▄[75.4,80.0]
g_free(str);

char *str = qdist_pr(&dist, 4, QDIST_PR_LABELS);
// -> [39.0,49.2)▁█▁▁[69.8,80.0]
g_free(str);

Reviewed-by: Richard Henderson <address@hidden>
Signed-off-by: Emilio G. Cota <address@hidden>
Message-Id: <address@hidden>
Signed-off-by: Richard Henderson <address@hidden>
---
 include/qemu/qdist.h |  63 ++++++++
 util/Makefile.objs   |   1 +
 util/qdist.c         | 395 +++++++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 459 insertions(+)
 create mode 100644 include/qemu/qdist.h
 create mode 100644 util/qdist.c

diff --git a/include/qemu/qdist.h b/include/qemu/qdist.h
new file mode 100644
index 0000000..f30050c
--- /dev/null
+++ b/include/qemu/qdist.h
@@ -0,0 +1,63 @@
+/*
+ * Copyright (C) 2016, Emilio G. Cota <address@hidden>
+ *
+ * License: GNU GPL, version 2 or later.
+ *   See the COPYING file in the top-level directory.
+ */
+#ifndef QEMU_QDIST_H
+#define QEMU_QDIST_H
+
+#include "qemu/osdep.h"
+#include "qemu-common.h"
+#include "qemu/bitops.h"
+
+/*
+ * Samples with the same 'x value' end up in the same qdist_entry,
+ * e.g. inc(0.1) and inc(0.1) end up as {x=0.1, count=2}.
+ *
+ * Binning happens only at print time, so that we retain the flexibility to
+ * choose the binning. This might not be ideal for workloads that do not care
+ * much about precision and insert many samples all with different x values;
+ * in that case, pre-binning (e.g. entering both 0.115 and 0.097 as 0.1)
+ * should be considered.
+ */
+struct qdist_entry {
+    double x;
+    unsigned long count;
+};
+
+struct qdist {
+    struct qdist_entry *entries;
+    size_t n;
+    size_t size;
+};
+
+#define QDIST_PR_BORDER     BIT(0)
+#define QDIST_PR_LABELS     BIT(1)
+/* the remaining options only work if PR_LABELS is set */
+#define QDIST_PR_NODECIMAL  BIT(2)
+#define QDIST_PR_PERCENT    BIT(3)
+#define QDIST_PR_100X       BIT(4)
+#define QDIST_PR_NOBINRANGE BIT(5)
+
+void qdist_init(struct qdist *dist);
+void qdist_destroy(struct qdist *dist);
+
+void qdist_add(struct qdist *dist, double x, long count);
+void qdist_inc(struct qdist *dist, double x);
+double qdist_xmin(const struct qdist *dist);
+double qdist_xmax(const struct qdist *dist);
+double qdist_avg(const struct qdist *dist);
+unsigned long qdist_sample_count(const struct qdist *dist);
+size_t qdist_unique_entries(const struct qdist *dist);
+
+/* callers must free the returned string with g_free() */
+char *qdist_pr_plain(const struct qdist *dist, size_t n_groups);
+
+/* callers must free the returned string with g_free() */
+char *qdist_pr(const struct qdist *dist, size_t n_groups, uint32_t opt);
+
+/* Only qdist code and test code should ever call this function */
+void qdist_bin__internal(struct qdist *to, const struct qdist *from, size_t n);
+
+#endif /* QEMU_QDIST_H */
diff --git a/util/Makefile.objs b/util/Makefile.objs
index a8a777e..702435e 100644
--- a/util/Makefile.objs
+++ b/util/Makefile.objs
@@ -32,3 +32,4 @@ util-obj-y += buffer.o
 util-obj-y += timed-average.o
 util-obj-y += base64.o
 util-obj-y += log.o
+util-obj-y += qdist.o
diff --git a/util/qdist.c b/util/qdist.c
new file mode 100644
index 0000000..4ea2e34
--- /dev/null
+++ b/util/qdist.c
@@ -0,0 +1,395 @@
+/*
+ * qdist.c - QEMU helpers for handling frequency distributions of data.
+ *
+ * Copyright (C) 2016, Emilio G. Cota <address@hidden>
+ *
+ * License: GNU GPL, version 2 or later.
+ *   See the COPYING file in the top-level directory.
+ */
+#include "qemu/qdist.h"
+
+#include <math.h>
+#ifndef NAN
+#define NAN (0.0 / 0.0)
+#endif
+
+void qdist_init(struct qdist *dist)
+{
+    dist->entries = g_malloc(sizeof(*dist->entries));
+    dist->size = 1;
+    dist->n = 0;
+}
+
+void qdist_destroy(struct qdist *dist)
+{
+    g_free(dist->entries);
+}
+
+static inline int qdist_cmp_double(double a, double b)
+{
+    if (a > b) {
+        return 1;
+    } else if (a < b) {
+        return -1;
+    }
+    return 0;
+}
+
+static int qdist_cmp(const void *ap, const void *bp)
+{
+    const struct qdist_entry *a = ap;
+    const struct qdist_entry *b = bp;
+
+    return qdist_cmp_double(a->x, b->x);
+}
+
+void qdist_add(struct qdist *dist, double x, long count)
+{
+    struct qdist_entry *entry = NULL;
+
+    if (dist->n) {
+        struct qdist_entry e;
+
+        e.x = x;
+        entry = bsearch(&e, dist->entries, dist->n, sizeof(e), qdist_cmp);
+    }
+
+    if (entry) {
+        entry->count += count;
+        return;
+    }
+
+    if (unlikely(dist->n == dist->size)) {
+        dist->size *= 2;
+        dist->entries = g_realloc(dist->entries,
+                                  sizeof(*dist->entries) * (dist->size));
+    }
+    dist->n++;
+    entry = &dist->entries[dist->n - 1];
+    entry->x = x;
+    entry->count = count;
+    qsort(dist->entries, dist->n, sizeof(*entry), qdist_cmp);
+}
+
+void qdist_inc(struct qdist *dist, double x)
+{
+    qdist_add(dist, x, 1);
+}
+
+/*
+ * Unicode for block elements. See:
+ *   https://en.wikipedia.org/wiki/Block_Elements
+ */
+static const gunichar qdist_blocks[] = {
+    0x2581,
+    0x2582,
+    0x2583,
+    0x2584,
+    0x2585,
+    0x2586,
+    0x2587,
+    0x2588
+};
+
+#define QDIST_NR_BLOCK_CODES ARRAY_SIZE(qdist_blocks)
+
+/*
+ * Print a distribution into a string.
+ *
+ * This function assumes that appropriate binning has been done on the input;
+ * see qdist_bin__internal() and qdist_pr_plain().
+ *
+ * Callers must free the returned string with g_free().
+ */
+static char *qdist_pr_internal(const struct qdist *dist)
+{
+    double min, max;
+    GString *s = g_string_new("");
+    size_t i;
+
+    /* if only one entry, its printout will be either full or empty */
+    if (dist->n == 1) {
+        if (dist->entries[0].count) {
+            g_string_append_unichar(s, qdist_blocks[QDIST_NR_BLOCK_CODES - 1]);
+        } else {
+            g_string_append_c(s, ' ');
+        }
+        goto out;
+    }
+
+    /* get min and max counts */
+    min = dist->entries[0].count;
+    max = min;
+    for (i = 0; i < dist->n; i++) {
+        struct qdist_entry *e = &dist->entries[i];
+
+        if (e->count < min) {
+            min = e->count;
+        }
+        if (e->count > max) {
+            max = e->count;
+        }
+    }
+
+    for (i = 0; i < dist->n; i++) {
+        struct qdist_entry *e = &dist->entries[i];
+        int index;
+
+        /* make an exception with 0; instead of using block[0], print a space 
*/
+        if (e->count) {
+            /* divide first to avoid loss of precision when e->count == max */
+            index = (e->count - min) / (max - min) * (QDIST_NR_BLOCK_CODES - 
1);
+            g_string_append_unichar(s, qdist_blocks[index]);
+        } else {
+            g_string_append_c(s, ' ');
+        }
+    }
+ out:
+    return g_string_free(s, FALSE);
+}
+
+/*
+ * Bin the distribution in @from into @n bins of consecutive, non-overlapping
+ * intervals, copying the result to @to.
+ *
+ * This function is internal to qdist: only this file and test code should
+ * ever call it.
+ *
+ * Note: calling this function on an already-binned qdist is a bug.
+ *
+ * If @n == 0 or @from->n == 1, use @from->n.
+ */
+void qdist_bin__internal(struct qdist *to, const struct qdist *from, size_t n)
+{
+    double xmin, xmax;
+    double step;
+    size_t i, j;
+
+    qdist_init(to);
+
+    if (from->n == 0) {
+        return;
+    }
+    if (n == 0 || from->n == 1) {
+        n = from->n;
+    }
+
+    /* set equally-sized bins between @from's left and right */
+    xmin = qdist_xmin(from);
+    xmax = qdist_xmax(from);
+    step = (xmax - xmin) / n;
+
+    if (n == from->n) {
+        /* if @from's entries are equally spaced, no need to re-bin */
+        for (i = 0; i < from->n; i++) {
+            if (from->entries[i].x != xmin + i * step) {
+                goto rebin;
+            }
+        }
+        /* they're equally spaced, so copy the dist and bail out */
+        to->entries = g_new(struct qdist_entry, from->n);
+        to->n = from->n;
+        memcpy(to->entries, from->entries, sizeof(*to->entries) * to->n);
+        return;
+    }
+
+ rebin:
+    j = 0;
+    for (i = 0; i < n; i++) {
+        double x;
+        double left, right;
+
+        left = xmin + i * step;
+        right = xmin + (i + 1) * step;
+
+        /* Add x, even if it might not get any counts later */
+        x = left;
+        qdist_add(to, x, 0);
+
+        /*
+         * To avoid double-counting we capture [left, right) ranges, except for
+         * the righmost bin, which captures a [left, right] range.
+         */
+        while (j < from->n && (from->entries[j].x < right || i == n - 1)) {
+            struct qdist_entry *o = &from->entries[j];
+
+            qdist_add(to, x, o->count);
+            j++;
+        }
+    }
+}
+
+/*
+ * Print @dist into a string, after re-binning it into @n bins of consecutive,
+ * non-overlapping intervals.
+ *
+ * If @n == 0, use @orig->n.
+ *
+ * Callers must free the returned string with g_free().
+ */
+char *qdist_pr_plain(const struct qdist *dist, size_t n)
+{
+    struct qdist binned;
+    char *ret;
+
+    if (dist->n == 0) {
+        return NULL;
+    }
+    qdist_bin__internal(&binned, dist, n);
+    ret = qdist_pr_internal(&binned);
+    qdist_destroy(&binned);
+    return ret;
+}
+
+static char *qdist_pr_label(const struct qdist *dist, size_t n_bins,
+                            uint32_t opt, bool is_left)
+{
+    const char *percent;
+    const char *lparen;
+    const char *rparen;
+    GString *s;
+    double x1, x2, step;
+    double x;
+    double n;
+    int dec;
+
+    s = g_string_new("");
+    if (!(opt & QDIST_PR_LABELS)) {
+        goto out;
+    }
+
+    dec = opt & QDIST_PR_NODECIMAL ? 0 : 1;
+    percent = opt & QDIST_PR_PERCENT ? "%" : "";
+
+    n = n_bins ? n_bins : dist->n;
+    x = is_left ? qdist_xmin(dist) : qdist_xmax(dist);
+    step = (qdist_xmax(dist) - qdist_xmin(dist)) / n;
+
+    if (opt & QDIST_PR_100X) {
+        x *= 100.0;
+        step *= 100.0;
+    }
+    if (opt & QDIST_PR_NOBINRANGE) {
+        lparen = rparen = "";
+        x1 = x;
+        x2 = x; /* unnecessary, but a dumb compiler might not figure it out */
+    } else {
+        lparen = "[";
+        rparen = is_left ? ")" : "]";
+        if (is_left) {
+            x1 = x;
+            x2 = x + step;
+        } else {
+            x1 = x - step;
+            x2 = x;
+        }
+    }
+    g_string_append_printf(s, "%s%.*f", lparen, dec, x1);
+    if (!(opt & QDIST_PR_NOBINRANGE)) {
+        g_string_append_printf(s, ",%.*f%s", dec, x2, rparen);
+    }
+    g_string_append(s, percent);
+ out:
+    return g_string_free(s, FALSE);
+}
+
+/*
+ * Print the distribution's histogram into a string.
+ *
+ * See also: qdist_pr_plain().
+ *
+ * Callers must free the returned string with g_free().
+ */
+char *qdist_pr(const struct qdist *dist, size_t n_bins, uint32_t opt)
+{
+    const char *border = opt & QDIST_PR_BORDER ? "|" : "";
+    char *llabel, *rlabel;
+    char *hgram;
+    GString *s;
+
+    if (dist->n == 0) {
+        return NULL;
+    }
+
+    s = g_string_new("");
+
+    llabel = qdist_pr_label(dist, n_bins, opt, true);
+    rlabel = qdist_pr_label(dist, n_bins, opt, false);
+    hgram = qdist_pr_plain(dist, n_bins);
+    g_string_append_printf(s, "%s%s%s%s%s",
+                           llabel, border, hgram, border, rlabel);
+    g_free(llabel);
+    g_free(rlabel);
+    g_free(hgram);
+
+    return g_string_free(s, FALSE);
+}
+
+static inline double qdist_x(const struct qdist *dist, int index)
+{
+    if (dist->n == 0) {
+        return NAN;
+    }
+    return dist->entries[index].x;
+}
+
+double qdist_xmin(const struct qdist *dist)
+{
+    return qdist_x(dist, 0);
+}
+
+double qdist_xmax(const struct qdist *dist)
+{
+    return qdist_x(dist, dist->n - 1);
+}
+
+size_t qdist_unique_entries(const struct qdist *dist)
+{
+    return dist->n;
+}
+
+unsigned long qdist_sample_count(const struct qdist *dist)
+{
+    unsigned long count = 0;
+    size_t i;
+
+    for (i = 0; i < dist->n; i++) {
+        struct qdist_entry *e = &dist->entries[i];
+
+        count += e->count;
+    }
+    return count;
+}
+
+static double qdist_pairwise_avg(const struct qdist *dist, size_t index,
+                                 size_t n, unsigned long count)
+{
+    /* amortize the recursion by using a base case > 2 */
+    if (n <= 8) {
+        size_t i;
+        double ret = 0;
+
+        for (i = 0; i < n; i++) {
+            struct qdist_entry *e = &dist->entries[index + i];
+
+            ret += e->x * e->count / count;
+        }
+        return ret;
+    } else {
+        size_t n2 = n / 2;
+
+        return qdist_pairwise_avg(dist, index, n2, count) +
+               qdist_pairwise_avg(dist, index + n2, n - n2, count);
+    }
+}
+
+double qdist_avg(const struct qdist *dist)
+{
+    unsigned long count;
+
+    count = qdist_sample_count(dist);
+    if (!count) {
+        return NAN;
+    }
+    return qdist_pairwise_avg(dist, 0, dist->n, count);
+}
-- 
2.5.5




reply via email to

[Prev in Thread] Current Thread [Next in Thread]