qemu-devel
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

[Qemu-devel] [PATCH 1/2] target/mips: Optimize ILVOD.<B|H|W|D> MSA instr


From: Mateja Marjanovic
Subject: [Qemu-devel] [PATCH 1/2] target/mips: Optimize ILVOD.<B|H|W|D> MSA instructions
Date: Fri, 15 Mar 2019 13:02:47 +0100

From: Mateja Marjanovic <address@hidden>

Optimize set of MSA instructions ILVOD, using directly
tcg registers and performing logic on them insted of
using helpers.
Performance measurement is done by executing the
instructions large number of times on a computer
with Intel Core i7-3770 CPU @ 3.40GHz×8.

 instruction ||    before    ||    after   ||
==============================================
 ilvod.b:        ||   66.97 ms   ||  26.34 ms  ||
 ilvod.h:        ||   44.75 ms   ||  25.17 ms  ||
 ilvod.w:        ||   41.27 ms   ||  24.37 ms  ||
 ilvod.d:        ||   41.75 ms   ||  20.50 ms  ||

Signed-off-by: Mateja Marjanovic <address@hidden>
---
 target/mips/helper.h     |   1 -
 target/mips/msa_helper.c |  51 --------------------
 target/mips/translate.c  | 119 ++++++++++++++++++++++++++++++++++++++++++++++-
 3 files changed, 118 insertions(+), 53 deletions(-)

diff --git a/target/mips/helper.h b/target/mips/helper.h
index a6d687e..d162836 100644
--- a/target/mips/helper.h
+++ b/target/mips/helper.h
@@ -865,7 +865,6 @@ DEF_HELPER_5(msa_pckod_df, void, env, i32, i32, i32, i32)
 DEF_HELPER_5(msa_ilvl_df, void, env, i32, i32, i32, i32)
 DEF_HELPER_5(msa_ilvr_df, void, env, i32, i32, i32, i32)
 DEF_HELPER_5(msa_ilvev_df, void, env, i32, i32, i32, i32)
-DEF_HELPER_5(msa_ilvod_df, void, env, i32, i32, i32, i32)
 DEF_HELPER_5(msa_vshf_df, void, env, i32, i32, i32, i32)
 DEF_HELPER_5(msa_srar_df, void, env, i32, i32, i32, i32)
 DEF_HELPER_5(msa_srlr_df, void, env, i32, i32, i32, i32)
diff --git a/target/mips/msa_helper.c b/target/mips/msa_helper.c
index 9d9dafe..cbcfd57 100644
--- a/target/mips/msa_helper.c
+++ b/target/mips/msa_helper.c
@@ -1363,57 +1363,6 @@ void helper_msa_ilvev_df(CPUMIPSState *env, uint32_t df, 
uint32_t wd,
     }
 }
 
-void helper_msa_ilvod_df(CPUMIPSState *env, uint32_t df, uint32_t wd,
-                         uint32_t ws, uint32_t wt)
-{
-    wr_t *pwd = &(env->active_fpu.fpr[wd].wr);
-    wr_t *pws = &(env->active_fpu.fpr[ws].wr);
-    wr_t *pwt = &(env->active_fpu.fpr[wt].wr);
-
-    switch (df) {
-    case DF_BYTE:
-        pwd->b[0]  = pwt->b[1];
-        pwd->b[1]  = pws->b[1];
-        pwd->b[2]  = pwt->b[3];
-        pwd->b[3]  = pws->b[3];
-        pwd->b[4]  = pwt->b[5];
-        pwd->b[5]  = pws->b[5];
-        pwd->b[6]  = pwt->b[7];
-        pwd->b[7]  = pws->b[7];
-        pwd->b[8]  = pwt->b[9];
-        pwd->b[9]  = pws->b[9];
-        pwd->b[10] = pwt->b[11];
-        pwd->b[11] = pws->b[11];
-        pwd->b[12] = pwt->b[13];
-        pwd->b[13] = pws->b[13];
-        pwd->b[14] = pwt->b[15];
-        pwd->b[15] = pws->b[15];
-        break;
-    case DF_HALF:
-        pwd->h[0] = pwt->h[1];
-        pwd->h[1] = pws->h[1];
-        pwd->h[2] = pwt->h[3];
-        pwd->h[3] = pws->h[3];
-        pwd->h[4] = pwt->h[5];
-        pwd->h[5] = pws->h[5];
-        pwd->h[6] = pwt->h[7];
-        pwd->h[7] = pws->h[7];
-        break;
-    case DF_WORD:
-        pwd->w[0] = pwt->w[1];
-        pwd->w[1] = pws->w[1];
-        pwd->w[2] = pwt->w[3];
-        pwd->w[3] = pws->w[3];
-        break;
-    case DF_DOUBLE:
-        pwd->d[0] = pwt->d[1];
-        pwd->d[1] = pws->d[1];
-        break;
-    default:
-        assert(0);
-    }
-}
-
 void helper_msa_ilvl_df(CPUMIPSState *env, uint32_t df, uint32_t wd,
                         uint32_t ws, uint32_t wt)
 {
diff --git a/target/mips/translate.c b/target/mips/translate.c
index b4a1103..101d2de 100644
--- a/target/mips/translate.c
+++ b/target/mips/translate.c
@@ -28889,6 +28889,108 @@ static void gen_msa_bit(CPUMIPSState *env, 
DisasContext *ctx)
     tcg_temp_free_i32(tws);
 }
 
+static inline void gen_ilvod_b(CPUMIPSState *env, uint32_t wd,
+                               uint32_t ws, uint32_t wt) {
+    TCGv_i64 t0 = tcg_temp_new_i64();
+    TCGv_i64 t1 = tcg_temp_new_i64();
+
+    uint64_t mask = (1ULL << 8) - 1;
+    mask |= mask << 16;
+    mask |= mask << 32;
+    mask <<= 8;
+    tcg_gen_movi_i64(t1, 0);
+
+    tcg_gen_andi_i64(t0, msa_wr_d[wt * 2], mask);
+    tcg_gen_shri_i64(t0, t0, 8);
+    tcg_gen_or_i64(t1, t1, t0);
+    tcg_gen_andi_i64(t0, msa_wr_d[ws * 2], mask);
+    tcg_gen_or_i64(t1, t1, t0);
+
+    tcg_gen_mov_i64(msa_wr_d[wd * 2], t1);
+
+    tcg_gen_movi_i64(t1, 0);
+
+    tcg_gen_andi_i64(t0, msa_wr_d[wt * 2 + 1], mask);
+    tcg_gen_shri_i64(t0, t0, 8);
+    tcg_gen_or_i64(t1, t1, t0);
+    tcg_gen_andi_i64(t0, msa_wr_d[ws * 2 + 1], mask);
+    tcg_gen_or_i64(t1, t1, t0);
+
+    tcg_gen_mov_i64(msa_wr_d[wd * 2 + 1], t1);
+
+    tcg_temp_free_i64(t0);
+    tcg_temp_free_i64(t1);
+}
+
+static inline void gen_ilvod_h(CPUMIPSState *env, uint32_t wd,
+                               uint32_t ws, uint32_t wt) {
+    TCGv_i64 t0 = tcg_temp_new_i64();
+    TCGv_i64 t1 = tcg_temp_new_i64();
+
+    uint64_t mask = (1ULL << 16) - 1;
+    mask |= mask << 32;
+    mask <<= 16;
+    tcg_gen_movi_i64(t1, 0);
+
+    tcg_gen_andi_i64(t0, msa_wr_d[wt * 2], mask);
+    tcg_gen_shri_i64(t0, t0, 16);
+    tcg_gen_or_i64(t1, t1, t0);
+    tcg_gen_andi_i64(t0, msa_wr_d[ws * 2], mask);
+    tcg_gen_or_i64(t1, t1, t0);
+
+    tcg_gen_mov_i64(msa_wr_d[wd * 2], t1);
+
+    tcg_gen_movi_i64(t1, 0);
+
+    tcg_gen_andi_i64(t0, msa_wr_d[wt * 2 + 1], mask);
+    tcg_gen_shri_i64(t0, t0, 16);
+    tcg_gen_or_i64(t1, t1, t0);
+    tcg_gen_andi_i64(t0, msa_wr_d[ws * 2 + 1], mask);
+    tcg_gen_or_i64(t1, t1, t0);
+
+    tcg_gen_mov_i64(msa_wr_d[wd * 2 + 1], t1);
+
+    tcg_temp_free_i64(t0);
+    tcg_temp_free_i64(t1);
+}
+
+static inline void gen_ilvod_w(CPUMIPSState *env, uint32_t wd,
+                               uint32_t ws, uint32_t wt) {
+    TCGv_i64 t0 = tcg_temp_new_i64();
+    TCGv_i64 t1 = tcg_temp_new_i64();
+
+    uint64_t mask = (1ULL << 32) - 1;
+    tcg_gen_movi_i64(t1, 0);
+
+    mask <<= 32;
+    tcg_gen_andi_i64(t0, msa_wr_d[wt * 2], mask);
+    tcg_gen_shri_i64(t0, t0, 32);
+    tcg_gen_or_i64(t1, t1, t0);
+    tcg_gen_andi_i64(t0, msa_wr_d[ws * 2], mask);
+    tcg_gen_or_i64(t1, t1, t0);
+
+    tcg_gen_mov_i64(msa_wr_d[wd * 2], t1);
+
+    tcg_gen_movi_i64(t1, 0);
+
+    tcg_gen_andi_i64(t0, msa_wr_d[wt * 2 + 1], mask);
+    tcg_gen_shri_i64(t0, t0, 32);
+    tcg_gen_or_i64(t1, t1, t0);
+    tcg_gen_andi_i64(t0, msa_wr_d[ws * 2 + 1], mask);
+    tcg_gen_or_i64(t1, t1, t0);
+
+    tcg_gen_mov_i64(msa_wr_d[wd * 2 + 1], t1);
+
+    tcg_temp_free_i64(t0);
+    tcg_temp_free_i64(t1);
+}
+
+static inline void gen_ilvod_d(CPUMIPSState *env, uint32_t wd,
+                               uint32_t ws, uint32_t wt) {
+    tcg_gen_mov_i64(msa_wr_d[wd * 2], msa_wr_d[wt * 2 + 1]);
+    tcg_gen_mov_i64(msa_wr_d[wd * 2 + 1], msa_wr_d[ws * 2 + 1]);
+}
+
 static void gen_msa_3r(CPUMIPSState *env, DisasContext *ctx)
 {
 #define MASK_MSA_3R(op)    (MASK_MSA_MINOR(op) | (op & (0x7 << 23)))
@@ -29060,7 +29162,22 @@ static void gen_msa_3r(CPUMIPSState *env, DisasContext 
*ctx)
         gen_helper_msa_mod_u_df(cpu_env, tdf, twd, tws, twt);
         break;
     case OPC_ILVOD_df:
-        gen_helper_msa_ilvod_df(cpu_env, tdf, twd, tws, twt);
+        switch (df) {
+        case DF_BYTE:
+            gen_ilvod_b(env, wd, ws, wt);
+            break;
+        case DF_HALF:
+            gen_ilvod_h(env, wd, ws, wt);
+            break;
+        case DF_WORD:
+            gen_ilvod_w(env, wd, ws, wt);
+            break;
+        case DF_DOUBLE:
+            gen_ilvod_d(env, wd, ws, wt);
+            break;
+        default:
+            assert(0);
+        }
         break;
 
     case OPC_DOTP_S_df:
-- 
2.7.4




reply via email to

[Prev in Thread] Current Thread [Next in Thread]