---------------------------------------------------------------------------------
Listing 1: gcc 4.0 produced code, using new gcc 4.0 options

make[2]: Entering directory `/cygdrive/e/var/visualstudioworkspace/gnubg/gnubg'
if /cygdrive/e/programme/gcc4/bin/gcc.exe -DHAVE_CONFIG_H -I. -I. -I. -I./lib  -
I/usr/include/libxml2 -I./intl -DLOCALEDIR=\"/usr/local/share/locale\" -I/includ
e/freetype2 -I/include -I/usr/include/glib-2.0 -I/usr/lib/glib-2.0/include
  -ftree-vectorizer-verbose=1  -fgcse-sm -fgcse-sm -fgcse-las -fgcse-after-reloa
d -floop-optimize2 -fivopts -fbranch-target-load-optimize  -ftree-vectorize -mar
ch=pentium4 -mtune=pentium4 -msse2 -mfpmath=sse -maccumulate-outgoing-args -mali
gn-double -fomit-frame-pointer -momit-leaf-frame-pointer -O3 -minline-all-string
ops -ffast-math -freg-struct-return -pipe -MT eval.o -MD -MP -MF ".deps/eval.Tpo
" -c -o eval.o eval.c; \

---------------------------------------------------------------------------------

;; 
;; Load parameters
;;

;; Set up the loop: esi = trip count, xmm1 = running float sum.
;; eax = pointer read from the stack slot at 44(%esp).
	movl	44(%esp), %eax
;; esi = 32-bit value at offset 4 of the object eax points to; it is
;; used as the loop bound by every "cmpl %esi, %ecx" further down.
	movl	4(%eax), %esi
	testl	%esi, %esi
;; Initial value of the sum comes from the stack slot at 16(%esp).
;; movss does not modify the flags, so the jle below still tests esi.
	movss	16(%esp), %xmm1
;; esi <= 0 (signed): nothing to iterate over, go straight to L115.
	jle	L115

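;;
;; Unroll remainder handling (this is not alignment analysis, no
;; address bits are inspected): the first iteration is executed
;; unconditionally, then the lower 3 bits of the remaining trip
;; count, (%esi - 1) & 7, select how many further single iterations
;; a pre-loop (not exactly a loop but kind of a switch with
;; fall-through) executes. That leaves a multiple of 8 iterations
;; for the main loop, which is unrolled 8 times.
;;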

;; eax = (esi - 1) & 7: the iterations left after the first one, modulo 8.
;; It is consumed by the selector that follows this sequence.
	leal	-1(%esi), %eax
	andl	$7, %eax
;; Iteration 0, executed unconditionally: xmm1 += (%ebx) * (%ebp).
;; ebx and ebp are set up before this excerpt (presumably the two float
;; input arrays -- confirm against the full listing).
	movss	(%ebx), %xmm0
	mulss	(%ebp), %xmm0
	addss	%xmm0, %xmm1
;; ebx advances by one 4-byte element; ecx = 1 counts the iterations done
;; and doubles as the index into the data at ebp.
	addl	$4, %ebx
	movl	$1, %ecx
;; Sets the flags for the "je L115" that follows (taken when esi == 1).
	cmpl	%esi, %ecx

;; The switch-type preloop selector.
;;
;; On entry the flags come from the preceding "cmpl %esi, %ecx" and
;; eax holds (esi - 1) & 7. Dispatch:
;;   ecx == esi -> L115  (all iterations already done)
;;   eax == 0   -> L117  (straight into the main loop)
;;   eax == 1   -> L234  (1 single iteration, then the main-loop check)
;;   eax == 2   -> L235  (2 single iterations)
;;   eax == 3   -> L236  (3)
;;   eax == 4   -> L237  (4)
;;   eax == 5   -> L238  (5)
;;   eax == 6   -> L239  (6)
;;   eax == 7   -> falls through into the code after this selector (7)
;; Each ".p2align 4,,5" asks the assembler to pad to a 16-byte boundary,
;; but only when that takes at most 5 bytes of padding.

	je	L115
	testl	%eax, %eax
	je	L117
	cmpl	$1, %eax
	je	L234
	cmpl	$2, %eax
	.p2align 4,,5
	je	L235
	cmpl	$3, %eax
	.p2align 4,,5
	je	L236
	cmpl	$4, %eax
	.p2align 4,,5
	je	L237
	cmpl	$5, %eax
	.p2align 4,,5
	je	L238
	cmpl	$6, %eax
	.p2align 4,,5
	je	L239
	
;; The actual preloop statements.
;;
;; A fall-through chain of single iterations. Each step does
;;     xmm1 += (%ebx) * (%ebp,%ecx,4);   ebx += 4;   ecx += 1
;; and then drops into the next label, so the entry point decides how
;; many steps run before the main loop.

;; Reached only by falling through the selector (7 steps to go).
;; ecx is still 1 from "movl $1, %ecx", so the index is folded into the
;; displacement 4(%ebp), and ecx becomes 2 by writing only its low byte.
	movss	(%ebx), %xmm0
	mulss	4(%ebp), %xmm0
	addss	%xmm0, %xmm1
	addl	$4, %ebx
	movb	$2, %cl
;; Entering here runs 6 steps.
L239:
	movss	(%ebx), %xmm0
	mulss	(%ebp,%ecx,4), %xmm0
	addss	%xmm0, %xmm1
	addl	$4, %ebx
	addl	$1, %ecx
;; Entering here runs 5 steps.
L238:
	movss	(%ebx), %xmm0
	mulss	(%ebp,%ecx,4), %xmm0
	addss	%xmm0, %xmm1
	addl	$4, %ebx
	addl	$1, %ecx
;; Entering here runs 4 steps.
L237:
	movss	(%ebx), %xmm0
	mulss	(%ebp,%ecx,4), %xmm0
	addss	%xmm0, %xmm1
	addl	$4, %ebx
	addl	$1, %ecx
;; Entering here runs 3 steps.
L236:
	movss	(%ebx), %xmm0
	mulss	(%ebp,%ecx,4), %xmm0
	addss	%xmm0, %xmm1
	addl	$4, %ebx
	addl	$1, %ecx
;; Entering here runs 2 steps.
L235:
	movss	(%ebx), %xmm0
	mulss	(%ebp,%ecx,4), %xmm0
	addss	%xmm0, %xmm1
	addl	$4, %ebx
	addl	$1, %ecx
;; Entering here runs 1 step.
L234:
	movss	(%ebx), %xmm0
	mulss	(%ebp,%ecx,4), %xmm0
	addss	%xmm0, %xmm1
	addl	$4, %ebx
	addl	$1, %ecx
;; Preloop finished: if ecx has reached esi everything is done, otherwise
;; execution continues with the main loop at L117.
	cmpl	%esi, %ecx
	je	L115


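;;
;; Main loop, unrolled 8 times: each pass handles 8 consecutive
;; floats (32 bytes), advancing %ebx by 32 and %ecx by 8.
;;
;; But this is still not vectorized. The remainder iterations were
;; peeled off above, yet no vector operations are used: a
;; vectorized loop would use the packed instructions movaps (or
;; movups), mulps and addps. The "ss" suffix means scalar
;; single-precision in GNU (AT&T) syntax exactly as in Intel
;; syntax, so every movss/mulss/addss here processes one float.
;; The 4 in 20(%ebp,%edx,4), %xmm0 is not a vector width either:
;; in AT&T syntax disp(base,index,scale) addresses
;; base + index*scale + disp, so the 4 is only the index scale
;; factor (4 bytes per float). What gcc produced here is a plain
;; scalar loop, merely unrolled.
;;
;; Also note that the braindead memory access in the hot loop
;; is gone, all is kept in registers until the end.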

;; Main loop body: one pass performs 8 scalar multiply-accumulate steps,
;; all added in order into the single accumulator xmm1.
;; On entry ebx points at the next element to load with movss and ecx is
;; the index of the matching element relative to ebp.
L117:
;; Element ecx.
	movss	(%ebx), %xmm0
	mulss	(%ebp,%ecx,4), %xmm0
	addss	%xmm0, %xmm1
;; eax = ebx + 4 and edx = ecx + 1 serve as base and index for the
;; remaining seven elements of this pass.
	leal	4(%ebx), %eax
	leal	1(%ecx), %edx
;; Elements ecx+1 .. ecx+7: displacements 0, 4, ..., 24 from eax on one
;; side and from (%ebp,%edx,4) on the other.
	movss	(%eax), %xmm0
	mulss	(%ebp,%edx,4), %xmm0
	addss	%xmm0, %xmm1
	movss	4(%eax), %xmm0
	mulss	4(%ebp,%edx,4), %xmm0
	addss	%xmm0, %xmm1
	movss	8(%eax), %xmm0
	mulss	8(%ebp,%edx,4), %xmm0
	addss	%xmm0, %xmm1
	movss	12(%eax), %xmm0
	mulss	12(%ebp,%edx,4), %xmm0
	addss	%xmm0, %xmm1
	movss	16(%eax), %xmm0
	mulss	16(%ebp,%edx,4), %xmm0
	addss	%xmm0, %xmm1
	movss	20(%eax), %xmm0
	mulss	20(%ebp,%edx,4), %xmm0
	addss	%xmm0, %xmm1
	movss	24(%eax), %xmm0
	mulss	24(%ebp,%edx,4), %xmm0
	addss	%xmm0, %xmm1
;; Advance for the next pass: ebx = eax + 28 (old ebx + 32 bytes),
;; ecx = edx + 7 (old ecx + 8 elements).
	leal	28(%eax), %ebx
	leal	7(%edx), %ecx
;; Loop until ecx == esi. The exit test is an equality test, so it relies
;; on the number of iterations left at L117 being a multiple of 8.
	cmpl	%esi, %ecx
	jne	L117
;; Common exit of the loop code: reached when esi <= 0, when ecx == esi
;; after the first iteration or the preloop, or by falling out of the
;; main loop.
L115:
;; Store the address of LC11 at the top of the stack. What consumes it
;; is beyond this excerpt (NOTE(review): looks like an outgoing call
;; argument -- confirm against the full listing).
	movl	$LC11, (%esp)
;; Write the sum back to 16(%esp), the slot xmm1 was loaded from
;; before the loop.
	movss	%xmm1, 16(%esp)