qemu-devel
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

TCG global variable aliasing/optimization question


From: Taylor Simpson
Subject: TCG global variable aliasing/optimization question
Date: Fri, 6 Mar 2020 15:55:40 +0000

Does the TCG optimizer assume all global variables are unique?  If so, is there a method to indicate that two global variables alias?

 

Background:

I am improving the way we handle register pairs for Hexagon.  The original implementation would read from the individual 32-bit registers and concatenate them to form the 64-bit value:

tcg_gen_concat_i32_i64(val64, hex_gpr[NUM], hex_gpr[(NUM) + 1]);

Similarly, a write would break apart the 64-bit value into two parts and store them individually

/* Low word */

tcg_gen_extrl_i64_i32(val32, val64);

tcg_gen_mov_tl(hex_gpr[rnum], val32);

/* High word */

tcg_gen_extrh_i64_i32(val32, val64);

tcg_gen_mov_tl(hex_gpr[rnum + 1], val32);

 

I’m hoping to get more efficient code by creating an array of global i64 variables that overlap the i32 single registers.

for (i = 0; i < TOTAL_PER_THREAD_REGS; i++) {

    hex_gpr[i] = tcg_global_mem_new(cpu_env,

        offsetof(CPUHexagonState, gpr[i]), hexagon_regnames[i]);

}

for (i = 0; i < TOTAL_PER_THREAD_REGS/2; i++) {

    hex_gpr_pairs[i] = tcg_global_mem_new_i64(cpu_env,

       offsetof(CPUHexagonState, gpr[2 * i]), hexagon_pairnames[i]);

}

So, a read would be

                tcg_gen_mov_i64(val64, hex_gpr_pairs[NUM/2]);

and a write would be

                tcg_gen_mov_i64(hex_gpr_pairs[NUM/2], val64);

 

Unfortunately, it seems TCG is optimizing with the assumption that globals don’t overlap.  Here’s an example

                {

                    r4 = ##0x11111111

                    r5 = ##0x22222222

                }

                {

                    p1 = cmp.eq(r3:2,r5:4)

                    r4 = r17

                    jump 1f

                }

Here is the TCG

---- 00400094

movi_i32 pc,$0x400094

movi_i32 slot_cancelled,$0x0

movi_i32 pred_written,$0x0

movi_i32 loc2,$0x11111111

mov_i32 new_r4,loc2

movi_i32 loc2,$0x22222222

mov_i32 new_r5,loc2

mov_i32 r4,new_r4                                                        /* Assignment to r4 value is 0x11111111 */

mov_i32 r5,new_r5                                                        /* Assignment to r5 value is 0x22222222 */

movi_i32 tmp0,$0x1

add_i32 pkt_cnt,pkt_cnt,tmp0

movi_i32 tmp0,$0x2

add_i32 insn_cnt,insn_cnt,tmp0

---- 004000a4

movi_i32 pc,$0x4000a4

movi_i32 slot_cancelled,$0x0

movi_i32 branch_taken,$0x0

movi_i32 next_PC,$0x4000ac

movi_i32 pred_written,$0x0

mov_i64 loc3,r3:2

mov_i64 loc4,r5:4                                                            /* Read from register pair r5:4 */

movi_i64 tmp5,$0xff

movi_i64 tmp6,$0x0

movcond_i64 tmp7,loc3,loc4,tmp5,tmp6,eq

extrl_i64_i32 loc2,tmp7

ext8u_i32 loc2,loc2

movi_i32 tmp0,$0x0

ext8u_i32 tmp1,loc2

and_i32 tmp8,tmp1,new_pred_p1

movi_i32 tmp10,$0x2

and_i32 tmp9,pred_written,tmp10

movcond_i32 new_pred_p1,tmp9,tmp0,tmp8,tmp1,ne

movi_i32 tmp10,$0x2

or_i32 pred_written,pred_written,tmp10

mov_i32 loc2,r17

movi_i32 tmp1,$0x8

add_i32 tmp0,pc,tmp1

movi_i32 tmp1,$0x0

movcond_i32 next_PC,branch_taken,tmp1,next_PC,tmp0,ne

movi_i32 branch_taken,$0x1

mov_i32 new_r4,loc2

mov_i32 r4,new_r4                                                        /* Assignment to r4 from r17 */

movi_i32 tmp0,$0x0

mov_i32 p1,new_pred_p1

mov_i32 pc,next_PC

movi_i32 tmp0,$0x1

add_i32 pkt_cnt,pkt_cnt,tmp0

movi_i32 tmp0,$0x2

add_i32 insn_cnt,insn_cnt,tmp0

exit_tb $0x0

set_label $L0

exit_tb $0x55bb47db6043

 

Here is the generated x86 code.

OUT: [size=186]

0x55bb47db6100:  mov    -0x8(%rbp),%ebx

0x55bb47db6103:  test   %ebx,%ebx

0x55bb47db6105:  jl     0x55bb47db61ae

0x55bb47db610b:  mov    $0x22222222,%ebx

0x55bb47db6110:  mov    %ebx,0x138(%rbp)

0x55bb47db6116:  mov    %ebx,0x14(%rbp)

0x55bb47db6119:  mov    0xd0(%rbp),%ebx

0x55bb47db611f:  inc    %ebx

0x55bb47db6121:  mov    0xd4(%rbp),%r12d

0x55bb47db6128:  add    $0x2,%r12d

0x55bb47db612c:  movl   $0x0,0x120(%rbp)

0x55bb47db6136:  mov    0x8(%rbp),%r13

0x55bb47db613a:  mov    0x10(%rbp),%r14

0x55bb47db613e:  mov    $0xff,%r15d

0x55bb47db6144:  xor    %r10d,%r10d

0x55bb47db6147:  cmp    %r14,%r13

0x55bb47db614a:  cmove  %r15,%r10

0x55bb47db614e:  mov    %r10d,%r13d

0x55bb47db6151:  mov    %r13d,0x32c(%rbp)

0x55bb47db6158:  movl   $0x2,0x338(%rbp)

0x55bb47db6162:  mov    $0x4000ac,%r14d

0x55bb47db6168:  mov    %r14d,0x114(%rbp)

0x55bb47db616f:  movl   $0x1,0x110(%rbp)

0x55bb47db6179:  mov    0x44(%rbp),%r15d

0x55bb47db617d:  mov    %r15d,0x134(%rbp)

0x55bb47db6184:  mov    %r15d,0x10(%rbp)

0x55bb47db6188:  mov    %r13d,0x104(%rbp)

0x55bb47db618f:  mov    %r14d,0xa4(%rbp)

0x55bb47db6196:  inc    %ebx

0x55bb47db6198:  mov    %ebx,0xd0(%rbp)

0x55bb47db619e:  lea    0x2(%r12),%ebx

0x55bb47db61a3:  mov    %ebx,0xd4(%rbp)

0x55bb47db61a9:  jmpq   0x55bb47db6016

0x55bb47db61ae:  lea    -0x172(%rip),%rax        # 0x55bb47db6043

0x55bb47db61b5:  jmpq   0x55bb47db6018

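The first assignment to r4 with 0x11111111 has been removed: the generated code never stores 0x11111111 to r4's slot at 0x10(%rbp) before the 64-bit load of r5:4 from that same address.  I guess this is because the second assignment to r4 makes the first one look dead, but it is NOT dead because the read from r5:4 accesses the value.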

 

Thanks,

Taylor

 


reply via email to

[Prev in Thread] Current Thread [Next in Thread]