qemu-devel
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

[Qemu-devel] sh : performance problem


From: Shin-ichiro KAWASAKI
Subject: [Qemu-devel] sh : performance problem
Date: Fri, 27 Feb 2009 01:28:07 +0900
User-agent: Thunderbird 2.0.0.19 (Windows/20081209)

Hi, all.

One of the current problems of qemu-sh4 system emulation is performance.
The kernel boot process does not seem particularly slow, but userland process
performance is very poor.  The problem can be seen numerically by compiling a
simple '.c' source with an empty main().  The results are as follows.

  sh4 : 5.8 [seconds]     (-M r2d + Fedora 6)
  arm : 0.8 [seconds]     (-M versatilepb + Debian ARM)

sh4 is 7 times slower than arm...

 /* I repeated the compile work until the measured time converged. */


Using OProfile, we can see what qemu is doing during the compile work on the
sh4 system emulation.  The top 20 time-consuming functions are as follows.

---------------------------------------------------------------------------
CPU: Core 2, speed 1600 MHz (estimated)
Counted CPU_CLK_UNHALTED events (Clock cycles when not halted) with a unit mask 
of 0x00 (Unhalted core cycles) count 100000
samples  %        image name               app name                 symbol name
38481    25.0550  qemu-system-sh4          qemu-system-sh4          
find_tlb_entry
19836    12.9152  anon (tgid:2483 range:0xb2181000-0xb3381000) qemu-system-sh4  
        (no symbols)
9333      6.0767  qemu-system-sh4          qemu-system-sh4          cpu_sh4_exec
8180      5.3260  qemu-system-sh4          qemu-system-sh4          
tcg_reg_alloc_op
7236      4.7114  qemu-system-sh4          qemu-system-sh4          temp_save
7051      4.5909  qemu-system-sh4          qemu-system-sh4          
tcg_liveness_analysis
4339      2.8251  libc-2.8.90.so           libc-2.8.90.so           (no symbols)
3414      2.2229  qemu-system-sh4          qemu-system-sh4          
tlb_set_page_exec
3017      1.9644  qemu-system-sh4          qemu-system-sh4          decode_opc
2802      1.8244  qemu-system-sh4          qemu-system-sh4          
get_physical_address
2358      1.5353  qemu-system-sh4          qemu-system-sh4          
find_itlb_entry
2215      1.4422  qemu-system-sh4          qemu-system-sh4          
cpu_sh4_write_mmaped_utlb_addr
1771      1.1531  qemu-system-sh4          qemu-system-sh4          
tcg_gen_code_search_pc
1746      1.1368  qemu-system-sh4          qemu-system-sh4          __ldl_mmu
1663      1.0828  qemu-system-sh4          qemu-system-sh4          
cpu_sh4_handle_mmu_fault
1557      1.0138  qemu-system-sh4          qemu-system-sh4          
gen_intermediate_code_pc
1328      0.8647  qemu-system-sh4          qemu-system-sh4          
tcg_temp_new_internal_i32
1233      0.8028  qemu-system-arm          qemu-system-arm          cpu_arm_exec
1229      0.8002  vmlinux                  vmlinux                  vmi_activate
1046      0.6811  Xvnc4                    Xvnc4                    (no symbols)
---------------------------------------------------------------------------

The most time-consuming function is 'find_tlb_entry()', which searches the
sh4's TLB entries for an entry that matches the given address.
If my understanding is right, this search happens when a TLB miss occurs.
Too many TLB misses cause bad performance, I guess.

The possible actions to solve this problem are:

 (i)   tune up 'find_tlb_entry()'
 (ii)  reduce TLB misses by expanding the page size
 (iii) reduce TLB misses by virtually increasing the number of TLB entries
       beyond what the real cpu has.

First, I tried (i).  The attached patch introduces a binary search.
It shortens the gcc compile time from 5.8 seconds to 4.6 seconds:
it makes sh4 system emulation 20% faster.

'find_tlb_entry()' searches the unified tlb array with 64 entries, and
the instruction tlb array with 4 entries.  This patch focuses only on the
unified tlb array search.

This patch is a rather rough one.  Any advice on brushing it up will be
appreciated.  I'm going to work on approaches (ii) and (iii).  Advice on them
would be welcome too.

Regards,
Shin-ichiro KAWASAKI


Index: trunk/target-sh4/cpu.h
===================================================================
--- trunk/target-sh4/cpu.h      (revision 6628)
+++ trunk/target-sh4/cpu.h      (working copy)
@@ -137,13 +137,16 @@
     uint32_t intevt;           /* interrupt event register */
 
     uint32_t pvr;              /* Processor Version Register */
     uint32_t prr;              /* Processor Revision Register */
     uint32_t cvr;              /* Cache Version Register */
 
-     CPU_COMMON tlb_t utlb[UTLB_SIZE]; /* unified translation table */
+    CPU_COMMON
+    tlb_t utlb[UTLB_SIZE];     /* unified translation table */
+    tlb_t * sorted_utlb[UTLB_SIZE];
+    uint32_t sorted_utlb_num;
     tlb_t itlb[ITLB_SIZE];     /* instruction translation table */
     void *intc_handle;
     int intr_at_halt;          /* SR_BL ignored during sleep */
 } CPUSH4State;
 
 CPUSH4State *cpu_sh4_init(const char *cpu_model);
Index: trunk/target-sh4/helper.c
===================================================================
--- trunk/target-sh4/helper.c   (revision 6628)
+++ trunk/target-sh4/helper.c   (working copy)
@@ -322,7 +322,7 @@
     if (e == MMU_DTLB_MULTIPLE)
        e = MMU_ITLB_MULTIPLE;
     else if (e == MMU_DTLB_MISS && update) {
-       e = find_tlb_entry(env, address, env->utlb, UTLB_SIZE, use_asid);
+       e = find_utlb_entry(env, address, use_asid);
        if (e >= 0) {
            tlb_t * ientry;
            n = itlb_replacement(env);
@@ -342,15 +342,69 @@
     return e;
 }
 
+static inline int is_utlb_match(tlb_t * e, uint32_t addr)
+{
+    return (e->vpn << 10 & ~(e->size - 1)) == (addr & ~(e->size - 1));
+}
+
 /* Find utlb entry
    Return entry, MMU_DTLB_MISS, MMU_DTLB_MULTIPLE */
 int find_utlb_entry(CPUState * env, target_ulong address, int use_asid)
 {
+    int min = 0;
+    int max = env->sorted_utlb_num - 1;
+    int cur = (min + max) / 2 ;
+    int save;
+    uint8_t asid = env->pteh & 0xff;
+    int ret = MMU_DTLB_MISS;
+
     /* per utlb access */
     increment_urc(env);
 
-    /* Return entry */
-    return find_tlb_entry(env, address, env->utlb, UTLB_SIZE, use_asid);
+    if (env->sorted_utlb_num <= 0)
+       return MMU_DTLB_MISS;
+
+    /* binary search */
+    while (!is_utlb_match(env->sorted_utlb[cur], address)) {
+       if (min >= max)
+           return MMU_DTLB_MISS;
+       if (((env->sorted_utlb[cur]->vpn << 10)
+           & ~(env->sorted_utlb[cur]->size - 1)) <
+           (address & ~(env->sorted_utlb[cur]->size - 1))) {
+           min = cur + 1;
+       } else {
+           max = cur - 1;
+       }
+       cur = (min + max) / 2;
+    }
+
+    save = cur;
+
+    /* minus search */
+    do {
+       tlb_t * e = env->sorted_utlb[cur];
+       if (e->sh || (!use_asid) || e->asid == asid) {
+           if (ret != MMU_DTLB_MISS)
+               return MMU_DTLB_MULTIPLE;
+           ret = e - &env->utlb[0];
+       }
+       cur--;
+    } while(cur >= 0 && is_utlb_match(env->sorted_utlb[cur], address));
+
+    /* plus search */
+    cur = save + 1;
+    while (cur < env->sorted_utlb_num &&
+          is_utlb_match(env->sorted_utlb[cur], address)) {
+       tlb_t * e = env->sorted_utlb[cur];
+       if (e->sh || (!use_asid) || e->asid == asid) {
+           if (ret != MMU_DTLB_MISS)
+               return MMU_DTLB_MULTIPLE;
+           ret = e - &env->utlb[0];
+       }
+       cur++;
+    } 
+
+    return ret;
 }
 
 /* Match address against MMU
@@ -525,6 +579,63 @@
     return physical;
 }
 
+static void add_entry_to_sorted_utlb(CPUState * env, tlb_t * entry)
+{
+    int i;
+
+    if (env->sorted_utlb_num == 0) {
+       env->sorted_utlb[0] = entry;
+       env->sorted_utlb_num = 1;
+       return;
+    }
+
+    if (entry->vpn <= env->sorted_utlb[0]->vpn) {
+       memmove(&env->sorted_utlb[1],
+               &env->sorted_utlb[0],
+               sizeof(tlb_t *) * env->sorted_utlb_num);
+       env->sorted_utlb[0] = entry;
+       env->sorted_utlb_num++;
+       return;
+    }
+
+    for (i = 0; i < env->sorted_utlb_num - 1; i++) {
+       tlb_t * e1 = env->sorted_utlb[i];
+       tlb_t * e2 = env->sorted_utlb[i + 1];
+       if (e1->vpn <= entry->vpn && entry->vpn <= e2->vpn) {
+           memmove(&env->sorted_utlb[i + 2],
+                   &env->sorted_utlb[i + 1],
+                   sizeof(tlb_t *) * (env->sorted_utlb_num - i - 1));
+           env->sorted_utlb[i + 1] = entry;
+           env->sorted_utlb_num++;
+           return;
+       }
+    }
+
+#if 0
+    assert(env->sorted_utlb_num < UTLB_SIZE);
+#endif
+    env->sorted_utlb[env->sorted_utlb_num] = entry;
+    env->sorted_utlb_num++;
+}
+
+static void remove_entry_from_sorted_utlb(CPUState * env, tlb_t * entry)
+{
+    int i;
+#if 0
+    assert(env->sorted_utlb_num > 0);
+#endif
+
+    for (i = 0; i < env->sorted_utlb_num; i++) {
+       if (env->sorted_utlb[i] == entry) {
+           env->sorted_utlb_num--;
+           memmove(&env->sorted_utlb[i],
+                   &env->sorted_utlb[i + 1],
+                   sizeof(tlb_t *) * (env->sorted_utlb_num - i));
+           return;
+       }
+    }
+}
+
 void cpu_load_tlb(CPUState * env)
 {
     int n = cpu_mmucr_urc(env->mmucr);
@@ -536,6 +647,8 @@
        if (!same_tlb_entry_exists(env->itlb, ITLB_SIZE, entry)) {
            tlb_flush_page(env, address);
        }
+
+       remove_entry_from_sorted_utlb(env, entry);
     }
 
     /* Take values into cpu status from registers. */
@@ -568,6 +681,9 @@
     entry->wt   = (uint8_t)cpu_ptel_wt(env->ptel);
     entry->sa   = (uint8_t)cpu_ptea_sa(env->ptea);
     entry->tc   = (uint8_t)cpu_ptea_tc(env->ptea);
+
+    /* add to sorted list */
+    add_entry_to_sorted_utlb(env, entry);
 }
 
 void cpu_sh4_write_mmaped_utlb_addr(CPUSH4State *s, target_phys_addr_t addr,
@@ -599,8 +715,12 @@
                    s->tea = addr;
                    break;
                }
-               if (entry->v && !v)
+               if (entry->v && !v) {
                    needs_tlb_flush = 1;
+                   remove_entry_from_sorted_utlb(s, entry);
+               } else if (!entry->v && v){
+                   add_entry_to_sorted_utlb(s, entry);
+               }
                entry->v = v;
                entry->d = d;
                utlb_match_entry = entry;
@@ -635,6 +755,7 @@
            if (!same_tlb_entry_exists(s->itlb, ITLB_SIZE, entry)) {
                tlb_flush_page(s, address);
            }
+           remove_entry_from_sorted_utlb(s, entry);
        }
        entry->asid = asid;
        entry->vpn = vpn;




reply via email to

[Prev in Thread] Current Thread [Next in Thread]