From: LIU Zhiwei
Subject: Re: [PATCH v3 1/5] target/riscv: add vector unit stride load and store instructions
Date: Wed, 19 Feb 2020 16:57:43 +0800
User-agent: Mozilla/5.0 (Windows NT 10.0; WOW64; rv:68.0) Gecko/20100101 Thunderbird/68.4.1

Hi Richard,

Thanks for your informative comments. I'm addressing them now, but I'm a little confused by a few of them.
On 2020/2/12 14:38, Richard Henderson wrote:
On 2/9/20 11:42 PM, LIU Zhiwei wrote:
+/*
+ * As simd_desc supports at most 256 bytes, while in this implementation
+ * the max vector group length is 2048 bytes, maxsz is split into two parts.
+ *
+ * The first part is maxsz rounded down to a multiple of 64, encoded in the
+ * maxsz field of simd_desc.
+ * The second part is (maxsz % 64) >> 3, encoded in the data field of simd_desc.
+ */
+static uint32_t maxsz_part1(uint32_t maxsz)
+{
+    return ((maxsz & ~(0x3f)) >> 3) + 0x8; /* add offset 8 to avoid returning 0 */
+}
+
+static uint32_t maxsz_part2(uint32_t maxsz)
+{
+    return (maxsz & 0x3f) >> 3;
+}
I would much rather adjust simd_desc to support 2048 bytes.

I've just posted a patch set that removes an assert in target/arm that would
trigger if SIMD_DATA_SHIFT was increased to make room for a larger oprsz.

Or, since we're not going through tcg_gen_gvec_* for ldst, don't bother with
simd_desc at all, and just pass vlen, unencoded.
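
A sketch of that last option, assuming desc simply carries vlen in bytes
(the function name is illustrative): the split/merge helpers above then
collapse to nothing.

static uint32_t vext_maxsz(uint32_t desc)
{
    return desc;    /* desc is the vector length in bytes, unencoded */
}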

+/* define check conditions data structure */
+struct vext_check_ctx {
+
+    struct vext_reg {
+        uint8_t reg;
+        bool widen;
+        bool need_check;
+    } check_reg[6];
+
+    struct vext_overlap_mask {
+        uint8_t reg;
+        uint8_t vm;
+        bool need_check;
+    } check_overlap_mask;
+
+    struct vext_nf {
+        uint8_t nf;
+        bool need_check;
+    } check_nf;
+    target_ulong check_misa;
+
+} vchkctx;
You cannot use a global variable.  The data must be thread-safe.

If we're going to do the checks this way, with a structure, it needs to be on
the stack or within DisasContext.
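
For instance, a stack-allocated sketch, assuming vext_check() is changed to
take the context as a parameter (the trans function and DO_OP stand-ins below
are illustrative, reusing the fields from the struct above):

static bool trans_vle_v(DisasContext *s, arg_r2nfvm *a)
{
    struct vext_check_ctx ctx = {
        .check_misa = RVV,
        .check_overlap_mask = { .need_check = true, .reg = a->rd, .vm = a->vm },
        .check_reg[0] = { .need_check = true, .reg = a->rd, .widen = false },
        .check_nf = { .need_check = true, .nf = a->nf },
    };

    if (!vext_check(s, &ctx)) {
        return false;
    }
    return do_vext_ld_us(s, a, 0);    /* hypothetical DO_OP/SEQ stand-ins */
}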

+#define GEN_VEXT_LD_US_TRANS(NAME, DO_OP, SEQ)                            \
+static bool trans_##NAME(DisasContext *s, arg_r2nfvm* a)                  \
+{                                                                         \
+    vchkctx.check_misa = RVV;                                             \
+    vchkctx.check_overlap_mask.need_check = true;                         \
+    vchkctx.check_overlap_mask.reg = a->rd;                               \
+    vchkctx.check_overlap_mask.vm = a->vm;                                \
+    vchkctx.check_reg[0].need_check = true;                               \
+    vchkctx.check_reg[0].reg = a->rd;                                     \
+    vchkctx.check_reg[0].widen = false;                                   \
+    vchkctx.check_nf.need_check = true;                                   \
+    vchkctx.check_nf.nf = a->nf;                                          \
+                                                                          \
+    if (!vext_check(s)) {                                                 \
+        return false;                                                     \
+    }                                                                     \
+    return DO_OP(s, a, SEQ);                                              \
+}
I don't see the improvement from a pointer.  Something like

    if (vext_check_isa_ill(s) &&
        vext_check_overlap(s, a->rd, a->vm) &&
        vext_check_reg(s, a->rd, false) &&
        vext_check_nf(s, a->nf)) {
        return DO_OP(s, a, SEQ);
    }
    return false;

seems just as clear without the extra data.
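
For reference, the small predicates that style implies might look like the
following sketch; the exact conditions are illustrative (based on the V-spec
register alignment rules), not taken from this patch, and it assumes
DisasContext carries vill and lmul:

static bool vext_check_isa_ill(DisasContext *s)
{
    return !s->vill;
}

static bool vext_check_overlap(DisasContext *s, uint32_t vd, bool vm)
{
    /* a masked op must not have vd overlap v0, the mask register */
    return vm || vd != 0;
}

static bool vext_check_reg(DisasContext *s, uint32_t reg, bool widen)
{
    int lmul = widen ? s->lmul + 1 : s->lmul;
    /* the register group must be aligned to LMUL */
    return (reg & ((1 << lmul) - 1)) == 0;
}

static bool vext_check_nf(DisasContext *s, uint32_t nf)
{
    /* NFIELDS * LMUL must not span more than 8 registers */
    return ((nf + 1) << s->lmul) <= 8;
}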

+#ifdef CONFIG_USER_ONLY
+#define MO_SB 0
+#define MO_LESW 0
+#define MO_LESL 0
+#define MO_LEQ 0
+#define MO_UB 0
+#define MO_LEUW 0
+#define MO_LEUL 0
+#endif
What is this for?  We already define these unconditionally.


+static inline int vext_elem_mask(void *v0, int mlen, int index)
+{
+    int idx = (index * mlen) / 8;
+    int pos = (index * mlen) % 8;
+
+    return (*((uint8_t *)v0 + idx) >> pos) & 0x1;
+}
This is a little-endian indexing of the mask.  Just above we talk about using a
host-endian ordering of uint64_t.

Thus this must be based on uint64_t instead of uint8_t.
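
Presumably something along these lines then, keeping mlen:

static inline int vext_elem_mask(void *v0, int mlen, int index)
{
    int idx = (index * mlen) / 64;
    int pos = (index * mlen) % 64;

    return (((uint64_t *)v0)[idx] >> pos) & 0x1;
}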

+/*
+ * This function checks the watchpoint before the real load operation.
+ *
+ * In softmmu mode, the TLB API probe_access is enough for the watchpoint
+ * check. In user mode, there is no watchpoint support now.
+ *
+ * It will trigger an exception if there is no mapping in the TLB
+ * and the page table walk can't fill the TLB entry. Then the guest
+ * software can return here after processing the exception, or never return.
+ */
+static void probe_read_access(CPURISCVState *env, target_ulong addr,
+        target_ulong len, uintptr_t ra)
+{
+    while (len) {
+        const target_ulong pagelen = -(addr | TARGET_PAGE_MASK);
+        const target_ulong curlen = MIN(pagelen, len);
+
+        probe_read(env, addr, curlen, cpu_mmu_index(env, false), ra);
The return value here is non-null when we can read directly from host memory.
It would be a shame to throw that work away.
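
A sketch of keeping that result (the helper name is made up; len must stay
within one page, which the curlen loop above already guarantees):

static void vext_ld_page(CPURISCVState *env, target_ulong addr,
        uint8_t *dst, target_ulong len, uintptr_t ra)
{
    int mmu_idx = cpu_mmu_index(env, false);
    void *host = probe_read(env, addr, len, mmu_idx, ra);

    if (host) {
        /* the whole access is backed by mapped host RAM: copy directly */
        memcpy(dst, host, len);
    } else {
        /* MMIO or otherwise slow memory: fall back to per-byte loads */
        for (target_ulong i = 0; i < len; i++) {
            dst[i] = cpu_ldub_data_ra(env, addr + i, ra);
        }
    }
}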


+/* data structure and common functions for load and store */
+typedef void vext_ld_elem_fn(CPURISCVState *env, target_ulong addr,
+        uint32_t idx, void *vd, uintptr_t retaddr);
+typedef void vext_st_elem_fn(CPURISCVState *env, target_ulong addr,
+        uint32_t idx, void *vd, uintptr_t retaddr);
+typedef target_ulong vext_get_index_addr(target_ulong base,
+        uint32_t idx, void *vs2);
+typedef void vext_ld_clear_elem(void *vd, uint32_t idx,
+        uint32_t cnt, uint32_t tot);
+
+struct vext_ldst_ctx {
+    struct vext_common_ctx vcc;
+    uint32_t nf;
+    target_ulong base;
+    target_ulong stride;
+    int mmuidx;
+
+    vext_ld_elem_fn *ld_elem;
+    vext_st_elem_fn *st_elem;
+    vext_get_index_addr *get_index_addr;
+    vext_ld_clear_elem *clear_elem;
+};
I think you should pass these elements directly, as needed, rather than putting
them all in a struct.

This would allow the main helper function to be inlined, which in turn allows
the mini helper functions to be inlined.
1. What is the main helper function, and what are the mini helper functions here?

I guess the main helper function is code like this:

#define GEN_VEXT_LD_UNIT_STRIDE(NAME, MTYPE, ETYPE)              \
void HELPER(NAME##_mask)(void *vd, target_ulong base, void *v0,  \
        CPURISCVState *env, uint32_t desc)                       \
{                                                                \
    static struct vext_ldst_ctx ctx;                             \
    vext_common_ctx_init(&ctx.vcc, sizeof(ETYPE),                \
        sizeof(MTYPE), env->vl, desc);                           \
    ctx.nf = vext_nf(desc);                                      \
    ctx.base = base;                                             \
    ctx.ld_elem = vext_##NAME##_ld_elem;                         \
    ctx.clear_elem = vext_##NAME##_clear_elem;                   \
                                                                 \
    vext_ld_unit_stride_mask(vd, v0, env, &ctx, GETPC());       \
}
And the mini helper function is code like this:
static void vext_ld_unit_stride_mask(void *vd, void *v0, CPURISCVState *env,
        struct vext_ldst_ctx *ctx, uintptr_t ra)
{
    uint32_t i, k;
    struct vext_common_ctx *s = &ctx->vcc;

    if (s->vl == 0) {
        return;
    }
    /* probe every access */
    for (i = 0; i < s->vl; i++) {
        if (!s->vm && !vext_elem_mask(v0, s->mlen, i)) {
            continue;
        }
        probe_read_access(env, ctx->base + ctx->nf * i * s->msz,
                ctx->nf * s->msz, ra);
    }
    /* load bytes from guest memory */
    for (i = 0; i < s->vl; i++) {
        k = 0;
        if (!s->vm && !vext_elem_mask(v0, s->mlen, i)) {
            continue;
        }
        while (k < ctx->nf) {
            target_ulong addr = ctx->base + (i * ctx->nf + k) * s->msz;
            ctx->ld_elem(env, addr, i + k * s->vlmax, vd, ra);
            k++;
        }
    }
    /* clear tail elements */
    for (k = 0; k < ctx->nf; k++) {
        ctx->clear_elem(vd, s->vl + k * s->vlmax, s->vl * s->esz,
                s->vlmax * s->esz);
    }
}
Is it right?
2. The number of parameters grows a lot when they are passed directly to the mini helper functions.

For example, the number of parameters increases from 5 to 11.

static void vext_ld_unit_stride(void *vd, target_ulong base, void *v0,
        CPURISCVState *env, vext_ld_elem_fn *ld_elem,
        vext_ld_clear_elem *clear_elem, uint32_t vlmax,
        uint32_t nf, uint32_t esz, uint32_t msz, uintptr_t ra)

As vlmax and nf can be extracted from desc, another form of the mini helper
would only increase the number of parameters from 5 to 10.
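
For concreteness, that second form might look like this sketch (the loop body
would be the same as in vext_ld_unit_stride_mask above; vext_maxsz() refers
to the unencoded-desc sketch earlier in this mail):

static void vext_ld_unit_stride(void *vd, target_ulong base, void *v0,
        CPURISCVState *env, uint32_t desc, vext_ld_elem_fn *ld_elem,
        vext_ld_clear_elem *clear_elem, uint32_t esz, uint32_t msz,
        uintptr_t ra)
{
    uint32_t nf = vext_nf(desc);
    uint32_t vlmax = vext_maxsz(desc) / esz;

    /* ... the same probe/load/clear loops as in vext_ld_unit_stride_mask ... */
}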

Is it OK?

BTW: in this patchset I use a lot of macros to generate code. Is that OK?

Best Regards,
Zhiwei
