/*
 * ---------------------------------------------------------------------------------
 * Listing 1: gcc 4.0 produced code, using new gcc 4.0 options
 *
 * make[2]: Entering directory `/cygdrive/e/var/visualstudioworkspace/gnubg/gnubg'
 * if /cygdrive/e/programme/gcc4/bin/gcc.exe -DHAVE_CONFIG_H -I. -I. -I. -I./lib
 *    -I/usr/include/libxml2 -I./intl -DLOCALEDIR=\"/usr/local/share/locale\"
 *    -I/include/freetype2 -I/include -I/usr/include/glib-2.0
 *    -I/usr/lib/glib-2.0/include
 *    -ftree-vectorizer-verbose=1 -fgcse-sm -fgcse-sm -fgcse-las
 *    -fgcse-after-reload -floop-optimize2 -fivopts
 *    -fbranch-target-load-optimize -ftree-vectorize
 *    -march=pentium4 -mtune=pentium4 -msse2 -mfpmath=sse
 *    -maccumulate-outgoing-args -malign-double -fomit-frame-pointer
 *    -momit-leaf-frame-pointer -O3 -minline-all-stringops -ffast-math
 *    -freg-struct-return -pipe -MT eval.o -MD -MP -MF ".deps/eval.Tpo"
 *    -c -o eval.o eval.c; \
 * ---------------------------------------------------------------------------------
 */

# Syntax: AT&T / GNU as, 32-bit x86, scalar SSE (single precision).
#
# This is an excerpt from the middle of a function: it computes
#     sum += a[i] * b[i]   for i = 0 .. count-1
# one float at a time.
#
# Register roles inside the excerpt:
#   %esi  = element count (signed), loaded below
#   %ecx  = index of the next element to process
#   %ebx  = pointer to the next float of the first operand (advances by 4)
#   %ebp  = base of the second operand, addressed as (%ebp,index,4)
#   %xmm1 = running sum, %xmm0 = scratch for the current product
#   %eax, %edx = scratch
# %ebx and %ebp are set up before the excerpt starts; what they point to
# beyond "consecutive 4-byte floats" cannot be told from here.

#
# Load parameters
#
        movl    44(%esp), %eax
        movl    4(%eax), %esi           # count = 32-bit field at offset 4
        testl   %esi, %esi
        movss   16(%esp), %xmm1         # sum (movss leaves EFLAGS untouched)
        jle     L115                    # count <= 0: nothing to do

#
# Remainder handling for the 8x unrolled main loop.
#
# Element 0 is always processed on its own.  The remaining count-1 elements
# are split into (count-1) mod 8 peeled iterations followed by whole groups
# of eight.  Note: the selector below is derived from the element count,
# not from any address, so this is unroll-remainder peeling rather than
# data alignment.
#
        leal    -1(%esi), %eax
        andl    $7, %eax                # eax = (count - 1) mod 8
        movss   (%ebx), %xmm0           # element 0
        mulss   (%ebp), %xmm0
        addss   %xmm0, %xmm1
        addl    $4, %ebx
        movl    $1, %ecx
        cmpl    %esi, %ecx
        je      L115                    # count == 1: done

# Switch-like selector: jump so that exactly eax peeled iterations run.
# Each ".p2align 4,,5" aligns the following branch to 16 bytes, but only
# when that costs at most 5 bytes of padding.
        testl   %eax, %eax
        je      L117                    # 0 peeled: straight to the main loop
        cmpl    $1, %eax
        je      L234
        cmpl    $2, %eax
        .p2align 4,,5
        je      L235
        cmpl    $3, %eax
        .p2align 4,,5
        je      L236
        cmpl    $4, %eax
        .p2align 4,,5
        je      L237
        cmpl    $5, %eax
        .p2align 4,,5
        je      L238
        cmpl    $6, %eax
        .p2align 4,,5
        je      L239

# Fall-through case, eax == 7.  The index is known to be 1 here, so the
# second operand is addressed with a constant displacement.
        movss   (%ebx), %xmm0
        mulss   4(%ebp), %xmm0
        addss   %xmm0, %xmm1
        addl    $4, %ebx
        movb    $2, %cl                 # ecx was 1, so this makes ecx = 2

# Peeled iterations; entering at L23x runs (x - 3) of them.
L239:
        movss   (%ebx), %xmm0
        mulss   (%ebp,%ecx,4), %xmm0
        addss   %xmm0, %xmm1
        addl    $4, %ebx
        addl    $1, %ecx
L238:
        movss   (%ebx), %xmm0
        mulss   (%ebp,%ecx,4), %xmm0
        addss   %xmm0, %xmm1
        addl    $4, %ebx
        addl    $1, %ecx
L237:
        movss   (%ebx), %xmm0
        mulss   (%ebp,%ecx,4), %xmm0
        addss   %xmm0, %xmm1
        addl    $4, %ebx
        addl    $1, %ecx
L236:
        movss   (%ebx), %xmm0
        mulss   (%ebp,%ecx,4), %xmm0
        addss   %xmm0, %xmm1
        addl    $4, %ebx
        addl    $1, %ecx
L235:
        movss   (%ebx), %xmm0
        mulss   (%ebp,%ecx,4), %xmm0
        addss   %xmm0, %xmm1
        addl    $4, %ebx
        addl    $1, %ecx
L234:
        movss   (%ebx), %xmm0
        mulss   (%ebp,%ecx,4), %xmm0
        addss   %xmm0, %xmm1
        addl    $4, %ebx
        addl    $1, %ecx
        cmpl    %esi, %ecx
        je      L115                    # no full group of eight left

#
# Main loop: eight scalar multiply-adds per trip.
#
# The loop is unrolled 8x but it is NOT vectorized: every operation is a
# scalar "ss" instruction (movss/mulss/addss).  A vectorized loop would use
# the packed forms (movaps or movups, mulps, addps) instead.
#
# In AT&T syntax an operand such as 20(%ebp,%edx,4) means the address
# ebp + edx*4 + 20.  The trailing 4 is the index scale factor, i.e. the
# size of one float, and has nothing to do with the number of elements
# handled per instruction.
#
# The running sum stays in %xmm1 for the whole loop; it is written back to
# memory only once, at L115.
#
# Invariant on entry: the number of elements left (esi - ecx) is a
# non-zero multiple of 8, which is why the exit test can use "not equal".
#
L117:
        movss   (%ebx), %xmm0           # element i
        mulss   (%ebp,%ecx,4), %xmm0
        addss   %xmm0, %xmm1
        leal    4(%ebx), %eax           # eax = address of element i+1
        leal    1(%ecx), %edx           # edx = i + 1
        movss   (%eax), %xmm0           # element i+1
        mulss   (%ebp,%edx,4), %xmm0
        addss   %xmm0, %xmm1
        movss   4(%eax), %xmm0          # element i+2
        mulss   4(%ebp,%edx,4), %xmm0
        addss   %xmm0, %xmm1
        movss   8(%eax), %xmm0          # element i+3
        mulss   8(%ebp,%edx,4), %xmm0
        addss   %xmm0, %xmm1
        movss   12(%eax), %xmm0         # element i+4
        mulss   12(%ebp,%edx,4), %xmm0
        addss   %xmm0, %xmm1
        movss   16(%eax), %xmm0         # element i+5
        mulss   16(%ebp,%edx,4), %xmm0
        addss   %xmm0, %xmm1
        movss   20(%eax), %xmm0         # element i+6
        mulss   20(%ebp,%edx,4), %xmm0
        addss   %xmm0, %xmm1
        movss   24(%eax), %xmm0         # element i+7
        mulss   24(%ebp,%edx,4), %xmm0
        addss   %xmm0, %xmm1
        leal    28(%eax), %ebx          # pointer += 32 bytes (8 floats)
        leal    7(%edx), %ecx           # i += 8
        cmpl    %esi, %ecx
        jne     L117

# Loop exit.  The excerpt ends right after these two stores.
L115:
        movl    $LC11, (%esp)           # address of LC11 to the top of stack
                                        # (presumably an outgoing call
                                        # argument; the call is not shown)
        movss   %xmm1, 16(%esp)         # write the sum back to its stack slot