qemu-devel

Re: [RFC PATCH 1/3] plugins: Added a new cache modelling plugin


From: Mahmoud Mandour
Subject: Re: [RFC PATCH 1/3] plugins: Added a new cache modelling plugin
Date: Wed, 2 Jun 2021 05:51:41 +0200



On Tue, Jun 1, 2021 at 1:12 PM Alex Bennée <alex.bennee@linaro.org> wrote:

Mahmoud Mandour <ma.mandourr@gmail.com> writes:

> Added a cache modelling plugin that uses a static configuration found in
> many commercial microprocessors and a random eviction policy.
>
> Signed-off-by: Mahmoud Mandour <ma.mandourr@gmail.com>
> ---
>  contrib/plugins/Makefile |   1 +
>  contrib/plugins/cache.c  | 398 +++++++++++++++++++++++++++++++++++++++
>  2 files changed, 399 insertions(+)
>  create mode 100644 contrib/plugins/cache.c
>
> diff --git a/contrib/plugins/Makefile b/contrib/plugins/Makefile
> index b9d7935e5e..2237b47f8b 100644
> --- a/contrib/plugins/Makefile
> +++ b/contrib/plugins/Makefile
> @@ -18,6 +18,7 @@ NAMES += hotpages
>  NAMES += howvec
>  NAMES += lockstep
>  NAMES += hwprofile
> +NAMES += cache

>  SONAMES := $(addsuffix .so,$(addprefix lib,$(NAMES)))

> diff --git a/contrib/plugins/cache.c b/contrib/plugins/cache.c
> new file mode 100644
> index 0000000000..f8c15ebed2
> --- /dev/null
> +++ b/contrib/plugins/cache.c
> @@ -0,0 +1,398 @@
> +/*
> + * Copyright (C) 2021, Mahmoud Mandour <ma.mandourr@gmail.com>
> + *
> + * License: GNU GPL, version 2 or later.
> + *   See the COPYING file in the top-level directory.
> + */
> +
> +#include <inttypes.h>
> +#include <assert.h>
> +#include <stdlib.h>
> +#include <inttypes.h>
> +#include <string.h>
> +#include <unistd.h>
> +#include <stdio.h>
> +#include <glib.h>
> +
> +#include <qemu-plugin.h>
> +
> +QEMU_PLUGIN_EXPORT int qemu_plugin_version = QEMU_PLUGIN_VERSION;
> +
> +static GRand *rng;
> +static GHashTable *dmiss_ht;
> +static GHashTable *imiss_ht;
> +
> +static GMutex dmtx, imtx;
> +
> +static int limit;
> +static bool sys;
> +
> +static uint64_t dmem_accesses;
> +static uint64_t dmisses;
> +
> +static uint64_t imem_accesses;
> +static uint64_t imisses;
> +
> +static enum qemu_plugin_mem_rw rw = QEMU_PLUGIN_MEM_RW;
> +
> +enum AccessResult {
> +    HIT = 0,
> +    MISS = 1
> +};
> +
> +struct InsnData {
> +    char *disas_str;
> +    uint64_t addr;
> +    uint64_t misses;
> +};

A little commentary on the relationship between CacheSet and CacheBlock
would be useful here for those trying to follow the code. Maybe a little
ascii art if you're up to it?
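Something like this, perhaps (just a sketch of how the pieces nest):

  Cache
    sets[0] .. sets[num_sets - 1]       <- chosen by the set bits of addr
        |
        v
    CacheSet
      blocks[0] .. blocks[assoc - 1]    <- searched for a matching tag
          |
          v
      CacheBlock { tag, valid }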

> +
> +struct CacheBlock {
> +    uint64_t tag;
> +    bool valid;
> +};
> +
> +struct CacheSet {
> +    struct CacheBlock *blocks;
> +};
> +
> +struct Cache {
> +    struct CacheSet *sets;
> +    int num_sets;
> +
> +    int cachesize;
> +    int blksize;
> +    int assoc;
> +
> +    uint64_t blk_mask;
> +    uint64_t set_mask;
> +    uint64_t tag_mask;
> +};
> +
> +struct Cache *dcache, *icache;
> +
> +static int pow_of_two(int num)
> +{
> +    g_assert((num & (num - 1)) == 0);
> +    int ret = 0;
> +    while (num /= 2) {
> +        ret++;
> +    }
> +    return ret;
> +}

You could probably eliminate this by:

  a) pre-calculating masks and shifts at start-up
  b) expressing cache-size as a power of 2 (are caches ever not?)

Currently it is by far the biggest hit on the CPU:

  46.42%  qemu-aarch64  libcache.so              [.] pow_of_two
  16.71%  qemu-aarch64  libcache.so              [.] lru_update_on_hit
  14.12%  qemu-aarch64  libcache.so              [.] in_cache
   6.73%  qemu-aarch64  libcache.so              [.] extract_tag
   4.52%  qemu-aarch64  libcache.so              [.] extract_set
   4.48%  qemu-aarch64  libcache.so              [.] access_cache
   2.34%  qemu-aarch64  libcache.so              [.] vcpu_insn_exec
   1.63%  qemu-aarch64  libcache.so              [.] vcpu_mem_access
   0.72%  qemu-aarch64  libglib-2.0.so.0.5800.3  [.] g_mutex_lock

> +static inline uint64_t extract_tag(struct Cache *cache, uint64_t addr)
> +{
> +    return (addr & cache->tag_mask) >>
> +        (pow_of_two(cache->num_sets) + pow_of_two(cache->blksize));
> +}
> +
> +static inline uint64_t extract_set(struct Cache *cache, uint64_t addr)
> +{
> +    return (addr & cache->set_mask) >> (pow_of_two(cache->blksize));
> +}

It would make sense to enforce pow_of_two for num_sets and blksize on
initialisation to avoid doing it for every tag. Maybe rename them to
set_shift and blksize_shift to better indicate their usage.

That's of course problematic; I need to precompute it once per cache and
not on every tag extraction. However, I think I got this wrong anyway,
since I do not even need to shift the tag down. I can extract the tag as
the high-order bits with a single mask on every access and store the
masked value in the cache, which works fine. pow_of_two would still pose
a problem for extract_set, so I can compute that shift once and store it
in the cache, yes.
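Something along these lines, I think (a rough, untested sketch; the
blksize_shift field follows your suggested naming):

  /* computed once in cache_init(), assuming power-of-two geometry */
  cache->blksize_shift = pow_of_two(blksize);

  static inline uint64_t extract_tag(struct Cache *cache, uint64_t addr)
  {
      /* keep the tag as the masked high-order bits; no shift needed */
      return addr & cache->tag_mask;
  }

  static inline uint64_t extract_set(struct Cache *cache, uint64_t addr)
  {
      return (addr & cache->set_mask) >> cache->blksize_shift;
  }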

Also, can you please tell me what you use to get profiling data for
plugins? To my knowledge, gprof cannot produce profiling data for
dynamically loaded shared libraries. I tried sprof, but I hit an error
caused by a bug that has apparently been known since 2009 and is still
unfixed :D

I guess callgrind would work, but its output is not as clean as the one
you posted.
 
> +
> +static struct Cache *cache_init(int blksize, int assoc, int cachesize)
> +{
> +    struct Cache *cache;
> +    int i;
> +
> +    cache = g_new(struct Cache, 1);
> +    cache->blksize = blksize;
> +    cache->assoc = assoc;
> +    cache->cachesize = cachesize;
> +    cache->num_sets = cachesize / (blksize * assoc);
> +    cache->sets = g_new(struct CacheSet, cache->num_sets);
> +
> +    for (i = 0; i < cache->num_sets; i++) {
> +        cache->sets[i].blocks = g_new0(struct CacheBlock, assoc);
> +    }
> +
> +    cache->blk_mask = blksize - 1;
> +    cache->set_mask = ((cache->num_sets - 1) << (pow_of_two(cache->blksize)));
> +    cache->tag_mask = ~(cache->set_mask | cache->blk_mask);
> +
> +    return cache;
> +}
> +
> +static int get_invalid_block(struct Cache *cache, uint64_t set)
> +{
> +    int i;
> +
> +    for (i = 0; i < cache->assoc; i++) {
> +        if (!cache->sets[set].blocks[i].valid) {
> +            /* compulsory (cold) miss: this slot was never filled */
> +            return i;
> +        }
> +    }
> +
> +    /* conflict/capacity miss: every block in the set is valid */
> +    return -1;
> +}
> +
> +static int get_replaced_block(struct Cache *cache)
> +{
> +    return g_rand_int_range(rng, 0, cache->assoc);
> +}
> +
> +static bool in_cache(struct Cache *cache, uint64_t addr)
> +{
> +    int i;
> +    uint64_t tag, set;
> +
> +    tag = extract_tag(cache, addr);
> +    set = extract_set(cache, addr);
> +
> +    for (i = 0; i < cache->assoc; i++) {
> +        if (cache->sets[set].blocks[i].tag == tag &&
> +                cache->sets[set].blocks[i].valid) {
> +            return true;
> +        }
> +    }
> +
> +    return false;
> +}
> +
> +static enum AccessResult access_cache(struct Cache *cache, uint64_t addr)

Does the enum really make things easier compared to a straight bool? You
could just explain things with a comment:

  /**
   * access_cache() - simulate a cache access
   * @cache: reference to the cache being used
   * @addr: address of cached entity
   *
   * Returns true if the cache hit, false otherwise and the cache is
   * updated for next time.
   */
  static bool access_cache(struct Cache *cache, uint64_t addr)
  {

And then have:

  if (!access_cache(icache, addr)) {
      struct InsnData *insn = get_or_create(imiss_ht, userdata, addr);
      insn->misses++;
      imisses++;
  }

I applied that, probably better. Thanks.


> +{
> +    uint64_t tag, set;
> +    int replaced_blk;
> +
> +    if (in_cache(cache, addr)) {
> +        return HIT;
> +    }
> +
> +    tag = extract_tag(cache, addr);
> +    set = extract_set(cache, addr);
> +
> +    replaced_blk = get_invalid_block(cache, set);
> +
> +    if (replaced_blk == -1) {
> +        replaced_blk = get_replaced_block(cache);
> +    }
> +
> +    cache->sets[set].blocks[replaced_blk].tag = tag;
> +    cache->sets[set].blocks[replaced_blk].valid = true;
> +
> +    return MISS;
> +}
> +
> +struct InsnData *get_or_create(GHashTable *ht, struct InsnData *insn_data,
> +                               uint64_t addr)
> +{
> +    struct InsnData *insn = g_hash_table_lookup(ht, GUINT_TO_POINTER(addr));
> +    if (!insn) {
> +        g_hash_table_insert(ht, GUINT_TO_POINTER(addr), (gpointer) insn_data);
> +        insn = insn_data;
> +    }
> +
> +    return insn;
> +}
> +
> +static void vcpu_mem_access(unsigned int cpu_index, qemu_plugin_meminfo_t info,
> +                            uint64_t vaddr, void *userdata)
> +{
> +    uint64_t insn_addr;
> +    uint64_t effective_addr;
> +    struct qemu_plugin_hwaddr *hwaddr;
> +
> +    g_mutex_lock(&dmtx);
> +    hwaddr = qemu_plugin_get_hwaddr(info, vaddr);
> +    if (hwaddr && qemu_plugin_hwaddr_is_io(hwaddr)) {
> +        g_mutex_unlock(&dmtx);
> +        return;
> +    }
> +
> +    insn_addr = ((struct InsnData *) userdata)->addr;
> +    effective_addr = hwaddr ? qemu_plugin_hwaddr_phys_addr(hwaddr) : vaddr;
> +
> +    if (access_cache(dcache, effective_addr) == MISS) {
> +        struct InsnData *insn = get_or_create(dmiss_ht, userdata, insn_addr);
> +        insn->misses++;
> +        dmisses++;
> +    }
> +    dmem_accesses++;
> +    g_mutex_unlock(&dmtx);
> +}
> +
> +static void vcpu_insn_exec(unsigned int vcpu_index, void *userdata)
> +{
> +    uint64_t addr;
> +
> +    g_mutex_lock(&imtx);
> +    addr = ((struct InsnData *) userdata)->addr;
> +
> +    if (access_cache(icache, addr) == MISS) {
> +        struct InsnData *insn = get_or_create(imiss_ht, userdata, addr);
> +        insn->misses++;
> +        imisses++;
> +    }
> +    imem_accesses++;
> +    g_mutex_unlock(&imtx);
> +}
> +
> +static void vcpu_tb_trans(qemu_plugin_id_t id, struct qemu_plugin_tb *tb)
> +{
> +    size_t n_insns;
> +    size_t i;
> +
> +    n_insns = qemu_plugin_tb_n_insns(tb);
> +    for (i = 0; i < n_insns; i++) {
> +        struct qemu_plugin_insn *insn = qemu_plugin_tb_get_insn(tb, i);
> +        uint64_t effective_addr;
> +
> +        if (sys) {
> +            effective_addr = (uint64_t) qemu_plugin_insn_haddr(insn);
> +        } else {
> +            effective_addr = (uint64_t) qemu_plugin_insn_vaddr(insn);
> +        }
> +
> +        struct InsnData *ddata = g_new(struct InsnData, 1);
> +        struct InsnData *idata = g_new(struct InsnData, 1);

OK I think I see what you were saying on the sync up earlier. You need
to take into account any given instruction may get translated multiple
times so I think for any given instruction you are tracking you want to
get or create an entry here.
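Something like this perhaps (a sketch only; miss_ht stands in for
whichever table you end up keying by address, and locking is elided):

  struct InsnData *data = g_hash_table_lookup(miss_ht,
                                              GUINT_TO_POINTER(effective_addr));
  if (!data) {
      data = g_new0(struct InsnData, 1);
      data->disas_str = qemu_plugin_insn_disas(insn);
      data->addr = effective_addr;
      g_hash_table_insert(miss_ht, GUINT_TO_POINTER(effective_addr), data);
  }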

Thanks for explaining that an instruction can get translated multiple
times; I did not know that. Applied, thanks.

> +
> +        ddata->disas_str = qemu_plugin_insn_disas(insn);
> +        ddata->misses = 0;
> +        ddata->addr = effective_addr;
> +
> +        idata->disas_str = g_strdup(ddata->disas_str);
> +        idata->misses = 0;
> +        idata->addr = effective_addr;

And you might as well combine the InsnData to track both data and icache
misses in one structure to avoid the duplication of records and strings.

Applied, thanks.
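The combined record now looks roughly like this (the dmisses/imisses
field names are my own):

  struct InsnData {
      char *disas_str;
      uint64_t addr;
      uint64_t dmisses;   /* data-cache misses attributed to this insn */
      uint64_t imisses;   /* instruction-cache misses */
  };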
 
> +
> +        qemu_plugin_register_vcpu_mem_cb(insn, vcpu_mem_access,
> +                                         QEMU_PLUGIN_CB_NO_REGS,
> +                                         rw, ddata);
> +
> +        qemu_plugin_register_vcpu_insn_exec_cb(insn, vcpu_insn_exec,
> +                                               QEMU_PLUGIN_CB_NO_REGS, idata);
> +    }
> +}
> +
> +static void print_entry(gpointer data)
> +{
> +    struct InsnData *insn = (struct InsnData *) data;
> +    g_autoptr(GString) xx = g_string_new("");
> +    g_string_append_printf(xx, "0x%" PRIx64 ": %s - misses: %lu\n",
> +            insn->addr, insn->disas_str, insn->misses);

As you are likely going to want to post-process this data I would
suggest a slightly more machine readable format:

  address, misses, instruction
  0x419298, 2, mov x0, x21
  0x41aa40, 2, add x0, x0, #0x17
  0x419640, 2, add x5, x4, #0x218
  0x41aa10, 2, adrp x1, #0x48b000
  0x4002d0, 2, adrp x16, #0x48b000
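For instance, the line in print_entry() could become something like this
(sketch only; PRIu64 keeps the 64-bit counter portable):

  g_string_append_printf(xx, "0x%" PRIx64 ", %" PRIu64 ", %s\n",
                         insn->addr, insn->misses, insn->disas_str);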

Applied, thanks.
 
> +    qemu_plugin_outs(xx->str);
> +}
> +
> +static void free_insn(gpointer data)
> +{
> +    struct InsnData *insn = (struct InsnData *) data;
> +    g_free(insn->disas_str);
> +    g_free(insn);
> +}
> +
> +static void free_cache(struct Cache *cache)
> +{
> +    for (int i = 0; i < cache->num_sets; i++) {
> +        g_free(cache->sets[i].blocks);
> +    }
> +
> +    g_free(cache->sets);
> +}
> +
> +static int cmp(gconstpointer a, gconstpointer b)

This will likely need renaming if you ever want to sort by different
things.
Yes, I will have two comparator functions, one for data misses and one
for instruction misses. 
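Roughly like this, I think (assuming the combined dmisses/imisses fields
from above):

  static gint dcmp(gconstpointer a, gconstpointer b)
  {
      const struct InsnData *insn_a = a, *insn_b = b;
      return insn_a->dmisses < insn_b->dmisses ? 1 : -1;
  }

  static gint icmp(gconstpointer a, gconstpointer b)
  {
      const struct InsnData *insn_a = a, *insn_b = b;
      return insn_a->imisses < insn_b->imisses ? 1 : -1;
  }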

> +{
> +    struct InsnData *insn_a = (struct InsnData *) a;
> +    struct InsnData *insn_b = (struct InsnData *) b;
> +
> +    return insn_a->misses < insn_b->misses ? 1 : -1;
> +}
> +
> +static void print_stats()
> +{
> +    g_autoptr(GString) rep = g_string_new("");
> +    g_string_append_printf(rep,
> +            "Data accesses: %lu, Misses: %lu\nMiss rate: %lf%%\n\n",
> +            dmem_accesses,
> +            dmisses,
> +            ((double)dmisses / dmem_accesses) * 100.0);
> +
> +    g_string_append_printf(rep,
> +            "Instruction accesses: %lu, Misses: %lu\nMiss rate: %lf%%\n\n",
> +            imem_accesses,
> +            imisses,
> +            ((double)imisses / imem_accesses) * 100.0);
> +
> +    qemu_plugin_outs(rep->str);
> +}
> +
> +static void plugin_exit()
> +{
> +    GList *curr;
> +    int i;
> +
> +    g_mutex_lock(&imtx);
> +    g_mutex_lock(&dmtx);
> +    GList *dmiss_insns = g_hash_table_get_values(dmiss_ht);
> +    GList *imiss_insns = g_hash_table_get_values(imiss_ht);
> +    dmiss_insns = g_list_sort(dmiss_insns, cmp);
> +    imiss_insns = g_list_sort(imiss_insns, cmp);
> +
> +    print_stats();
> +
> +    qemu_plugin_outs("Most data-missing instructions\n");
> +    for (curr = dmiss_insns, i = 0; curr && i < limit; i++, curr = curr->next) {
> +        print_entry(curr->data);
> +    }
> +
> +    qemu_plugin_outs("\nMost fetch-missing instructions\n");
> +    for (curr = imiss_insns, i = 0; curr && i < limit; i++, curr = curr->next) {
> +        print_entry(curr->data);
> +    }
> +
> +    free_cache(dcache);
> +    free_cache(icache);
> +
> +    g_list_free(dmiss_insns);
> +    g_list_free(imiss_insns);
> +
> +    g_hash_table_destroy(dmiss_ht);
> +    g_hash_table_destroy(imiss_ht);
> +
> +    g_mutex_unlock(&dmtx);
> +    g_mutex_unlock(&imtx);
> +}
> +
> +QEMU_PLUGIN_EXPORT
> +int qemu_plugin_install(qemu_plugin_id_t id, const qemu_info_t *info,
> +                        int argc, char **argv)
> +{
> +    int i;
> +    int iassoc, iblksize, icachesize;
> +    int dassoc, dblksize, dcachesize;
> +
> +    limit = 32;
> +    sys = info->system_emulation;
> +
> +    dassoc = 8;
> +    dblksize = 64;
> +    dcachesize = dblksize * dassoc * 32;
> +
> +    iassoc = 8;
> +    iblksize = 64;
> +    icachesize = iblksize * iassoc * 32;
> +
> +    rng = g_rand_new();
> +
> +    for (i = 0; i < argc; i++) {
> +        char *opt = argv[i];
> +        if (g_str_has_prefix(opt, "limit=")) {
> +            limit = g_ascii_strtoull(opt + 6, NULL, 10);
> +        } else {
> +            fprintf(stderr, "option parsing failed: %s\n", opt);
> +            return -1;
> +        }
> +    }
> +
> +    dcache = cache_init(dblksize, dassoc, dcachesize);
> +    icache = cache_init(iblksize, iassoc, icachesize);
> +
> +    qemu_plugin_register_vcpu_tb_trans_cb(id, vcpu_tb_trans);
> +    qemu_plugin_register_atexit_cb(id, plugin_exit, NULL);
> +
> +    dmiss_ht = g_hash_table_new_full(NULL, g_direct_equal, NULL, free_insn);
> +    imiss_ht = g_hash_table_new_full(NULL, g_direct_equal, NULL, free_insn);
> +
> +    return 0;
> +}


--
Alex Bennée
