[Top][All Lists]
[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
[Qemu-devel] sh : performance problem
From: |
Shin-ichiro KAWASAKI |
Subject: |
[Qemu-devel] sh : performance problem |
Date: |
Fri, 27 Feb 2009 01:28:07 +0900 |
User-agent: |
Thunderbird 2.0.0.19 (Windows/20081209) |
Hi, all.
One of the current problems of qemu-sh4 system emulation is performance.
The kernel boot process does not seem too slow, but userland process performance
is very poor. This can be seen numerically when you compile a simple '.c'
source file with an empty main(). The results are as follows.
sh4 : 5.8 [seconds] (-M r2d + Fedora 6)
arm : 0.8 [seconds] (-M versatilepb + Debian ARM)
sh4 is 7 times slower than arm...
/* I repeated the compile until the measured time converged. */
Using OProfile, we can see what qemu is doing during the compile on the sh4
system emulation. The top 20 time-consuming functions are as follows.
---------------------------------------------------------------------------
CPU: Core 2, speed 1600 MHz (estimated)
Counted CPU_CLK_UNHALTED events (Clock cycles when not halted) with a unit mask
of 0x00 (Unhalted core cycles) count 100000
samples % image name app name symbol name
38481 25.0550 qemu-system-sh4 qemu-system-sh4
find_tlb_entry
19836 12.9152 anon (tgid:2483 range:0xb2181000-0xb3381000) qemu-system-sh4
(no symbols)
9333 6.0767 qemu-system-sh4 qemu-system-sh4 cpu_sh4_exec
8180 5.3260 qemu-system-sh4 qemu-system-sh4
tcg_reg_alloc_op
7236 4.7114 qemu-system-sh4 qemu-system-sh4 temp_save
7051 4.5909 qemu-system-sh4 qemu-system-sh4
tcg_liveness_analysis
4339 2.8251 libc-2.8.90.so libc-2.8.90.so (no symbols)
3414 2.2229 qemu-system-sh4 qemu-system-sh4
tlb_set_page_exec
3017 1.9644 qemu-system-sh4 qemu-system-sh4 decode_opc
2802 1.8244 qemu-system-sh4 qemu-system-sh4
get_physical_address
2358 1.5353 qemu-system-sh4 qemu-system-sh4
find_itlb_entry
2215 1.4422 qemu-system-sh4 qemu-system-sh4
cpu_sh4_write_mmaped_utlb_addr
1771 1.1531 qemu-system-sh4 qemu-system-sh4
tcg_gen_code_search_pc
1746 1.1368 qemu-system-sh4 qemu-system-sh4 __ldl_mmu
1663 1.0828 qemu-system-sh4 qemu-system-sh4
cpu_sh4_handle_mmu_fault
1557 1.0138 qemu-system-sh4 qemu-system-sh4
gen_intermediate_code_pc
1328 0.8647 qemu-system-sh4 qemu-system-sh4
tcg_temp_new_internal_i32
1233 0.8028 qemu-system-arm qemu-system-arm cpu_arm_exec
1229 0.8002 vmlinux vmlinux vmi_activate
1046 0.6811 Xvnc4 Xvnc4 (no symbols)
---------------------------------------------------------------------------
The most time-consuming function is 'find_tlb_entry()', which searches the sh4's TLB
entries to find an entry that matches the given address.
If my understanding is right, this search happens when a TLB miss occurs.
Too many TLB misses cause bad performance, I guess.
Possible actions to solve this problem are:
(i) tune up 'find_tlb_entry()'
(ii) reduce TLB misses by expanding the page size
(iii) reduce TLB misses by virtually increasing the number of TLB entries
beyond what the real cpu has.
First, I tried (i). The attached patch introduces binary search.
It shortens the gcc compile time from 5.8 seconds to 4.6 seconds:
it makes sh4 system emulation 20% faster.
'find_tlb_entry()' searches the unified tlb array with 64 entries and the
instruction tlb array with 4 entries. This patch focuses only on the unified
tlb array search.
This patch is rather rough. Any advice on polishing it would be appreciated.
I'm going to work on approaches (ii) and (iii). Advice on them would be
welcome too.
Regards,
Shin-ichiro KAWASAKI
Index: trunk/target-sh4/cpu.h
===================================================================
--- trunk/target-sh4/cpu.h (revision 6628)
+++ trunk/target-sh4/cpu.h (working copy)
@@ -137,13 +137,16 @@
uint32_t intevt; /* interrupt event register */
uint32_t pvr; /* Processor Version Register */
uint32_t prr; /* Processor Revision Register */
uint32_t cvr; /* Cache Version Register */
- CPU_COMMON tlb_t utlb[UTLB_SIZE]; /* unified translation table */
+ CPU_COMMON
+ tlb_t utlb[UTLB_SIZE]; /* unified translation table */
+ tlb_t * sorted_utlb[UTLB_SIZE];
+ uint32_t sorted_utlb_num;
tlb_t itlb[ITLB_SIZE]; /* instruction translation table */
void *intc_handle;
int intr_at_halt; /* SR_BL ignored during sleep */
} CPUSH4State;
CPUSH4State *cpu_sh4_init(const char *cpu_model);
Index: trunk/target-sh4/helper.c
===================================================================
--- trunk/target-sh4/helper.c (revision 6628)
+++ trunk/target-sh4/helper.c (working copy)
@@ -322,7 +322,7 @@
if (e == MMU_DTLB_MULTIPLE)
e = MMU_ITLB_MULTIPLE;
else if (e == MMU_DTLB_MISS && update) {
- e = find_tlb_entry(env, address, env->utlb, UTLB_SIZE, use_asid);
+ e = find_utlb_entry(env, address, use_asid);
if (e >= 0) {
tlb_t * ientry;
n = itlb_replacement(env);
@@ -342,15 +342,69 @@
return e;
}
+static inline int is_utlb_match(tlb_t * e, uint32_t addr)
+{
+ return (e->vpn << 10 & ~(e->size - 1)) == (addr & ~(e->size - 1));
+}
+
/* Find utlb entry
Return entry, MMU_DTLB_MISS, MMU_DTLB_MULTIPLE */
int find_utlb_entry(CPUState * env, target_ulong address, int use_asid)
{
+ int min = 0;
+ int max = env->sorted_utlb_num - 1;
+ int cur = (min + max) / 2 ;
+ int save;
+ uint8_t asid = env->pteh & 0xff;
+ int ret = MMU_DTLB_MISS;
+
/* per utlb access */
increment_urc(env);
- /* Return entry */
- return find_tlb_entry(env, address, env->utlb, UTLB_SIZE, use_asid);
+ if (env->sorted_utlb_num <= 0)
+ return MMU_DTLB_MISS;
+
+ /* binary search */
+ while (!is_utlb_match(env->sorted_utlb[cur], address)) {
+ if (min >= max)
+ return MMU_DTLB_MISS;
+ if (((env->sorted_utlb[cur]->vpn << 10)
+ & ~(env->sorted_utlb[cur]->size - 1)) <
+ (address & ~(env->sorted_utlb[cur]->size - 1))) {
+ min = cur + 1;
+ } else {
+ max = cur - 1;
+ }
+ cur = (min + max) / 2;
+ }
+
+ save = cur;
+
+ /* minus search */
+ do {
+ tlb_t * e = env->sorted_utlb[cur];
+ if (e->sh || (!use_asid) || e->asid == asid) {
+ if (ret != MMU_DTLB_MISS)
+ return MMU_DTLB_MULTIPLE;
+ ret = e - &env->utlb[0];
+ }
+ cur--;
+ } while(cur >= 0 && is_utlb_match(env->sorted_utlb[cur], address));
+
+ /* plus search */
+ cur = save + 1;
+ while (cur < env->sorted_utlb_num &&
+ is_utlb_match(env->sorted_utlb[cur], address)) {
+ tlb_t * e = env->sorted_utlb[cur];
+ if (e->sh || (!use_asid) || e->asid == asid) {
+ if (ret != MMU_DTLB_MISS)
+ return MMU_DTLB_MULTIPLE;
+ ret = e - &env->utlb[0];
+ }
+ cur++;
+ }
+
+ return ret;
}
/* Match address against MMU
@@ -525,6 +579,63 @@
return physical;
}
+static void add_entry_to_sorted_utlb(CPUState * env, tlb_t * entry)
+{
+ int i;
+
+ if (env->sorted_utlb_num == 0) {
+ env->sorted_utlb[0] = entry;
+ env->sorted_utlb_num = 1;
+ return;
+ }
+
+ if (entry->vpn <= env->sorted_utlb[0]->vpn) {
+ memmove(&env->sorted_utlb[1],
+ &env->sorted_utlb[0],
+ sizeof(tlb_t *) * env->sorted_utlb_num);
+ env->sorted_utlb[0] = entry;
+ env->sorted_utlb_num++;
+ return;
+ }
+
+ for (i = 0; i < env->sorted_utlb_num - 1; i++) {
+ tlb_t * e1 = env->sorted_utlb[i];
+ tlb_t * e2 = env->sorted_utlb[i + 1];
+ if (e1->vpn <= entry->vpn && entry->vpn <= e2->vpn) {
+ memmove(&env->sorted_utlb[i + 2],
+ &env->sorted_utlb[i + 1],
+ sizeof(tlb_t *) * (env->sorted_utlb_num - i - 1));
+ env->sorted_utlb[i + 1] = entry;
+ env->sorted_utlb_num++;
+ return;
+ }
+ }
+
+#if 0
+ assert(env->sorted_utlb_num < UTLB_SIZE);
+#endif
+ env->sorted_utlb[env->sorted_utlb_num] = entry;
+ env->sorted_utlb_num++;
+}
+
+static void remove_entry_from_sorted_utlb(CPUState * env, tlb_t * entry)
+{
+ int i;
+#if 0
+ assert(env->sorted_utlb_num > 0);
+#endif
+
+ for (i = 0; i < env->sorted_utlb_num; i++) {
+ if (env->sorted_utlb[i] == entry) {
+ env->sorted_utlb_num--;
+ memmove(&env->sorted_utlb[i],
+ &env->sorted_utlb[i + 1],
+ sizeof(tlb_t *) * (env->sorted_utlb_num - i));
+ return;
+ }
+ }
+}
+
void cpu_load_tlb(CPUState * env)
{
int n = cpu_mmucr_urc(env->mmucr);
@@ -536,6 +647,8 @@
if (!same_tlb_entry_exists(env->itlb, ITLB_SIZE, entry)) {
tlb_flush_page(env, address);
}
+
+ remove_entry_from_sorted_utlb(env, entry);
}
/* Take values into cpu status from registers. */
@@ -568,6 +681,9 @@
entry->wt = (uint8_t)cpu_ptel_wt(env->ptel);
entry->sa = (uint8_t)cpu_ptea_sa(env->ptea);
entry->tc = (uint8_t)cpu_ptea_tc(env->ptea);
+
+ /* add to sorted list */
+ add_entry_to_sorted_utlb(env, entry);
}
void cpu_sh4_write_mmaped_utlb_addr(CPUSH4State *s, target_phys_addr_t addr,
@@ -599,8 +715,12 @@
s->tea = addr;
break;
}
- if (entry->v && !v)
+ if (entry->v && !v) {
needs_tlb_flush = 1;
+ remove_entry_from_sorted_utlb(s, entry);
+ } else if (!entry->v && v){
+ add_entry_to_sorted_utlb(s, entry);
+ }
entry->v = v;
entry->d = d;
utlb_match_entry = entry;
@@ -635,6 +755,7 @@
if (!same_tlb_entry_exists(s->itlb, ITLB_SIZE, entry)) {
tlb_flush_page(s, address);
}
+ remove_entry_from_sorted_utlb(s, entry);
}
entry->asid = asid;
entry->vpn = vpn;
- [Qemu-devel] sh : performance problem,
Shin-ichiro KAWASAKI <=