[Traverso-commit] traverso CMakeLists.txt src/core/CMakeLists.txt...

From: Remon Sijrier
Subject: [Traverso-commit] traverso CMakeLists.txt src/core/CMakeLists.txt...
Date: Wed, 07 Nov 2007 20:28:21 +0000

CVSROOT:        /sources/traverso
Module name:    traverso
Changes by:     Remon Sijrier <r_sijrier>       07/11/07 20:28:21

Modified files:
        .              : CMakeLists.txt 
        src/core       : CMakeLists.txt 
        src/engine     : CMakeLists.txt 
Added files:
        src/common     : sse_functions_64bit.S sse_functions.S 

Log message:
        * Ported the optimization settings from the qmake build system to the CMake build system. Somehow the runtime SSE detection doesn't work :(
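
For context: the hunk below detects SSE by grepping /proc/cpuinfo at configure time, so the result describes the build host rather than the machine that eventually runs the binary. A check made at run time on x86 would query the CPUID instruction; a minimal C sketch, assuming GCC's <cpuid.h> (host_supports_sse is an illustrative name, not code from the Traverso tree):

#include <cpuid.h>

static int host_supports_sse(void)
{
        unsigned int eax, ebx, ecx, edx;

        /* CPUID leaf 1 returns the feature flags; SSE is EDX bit 25,
           which <cpuid.h> names bit_SSE */
        if (!__get_cpuid(1, &eax, &ebx, &ecx, &edx))
                return 0; /* CPUID leaf 1 not available */
        return (edx & bit_SSE) != 0;
}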

CVSWeb URLs:
http://cvs.savannah.gnu.org/viewcvs/traverso/CMakeLists.txt?cvsroot=traverso&r1=1.7&r2=1.8
http://cvs.savannah.gnu.org/viewcvs/traverso/src/common/sse_functions_64bit.S?cvsroot=traverso&rev=1.1
http://cvs.savannah.gnu.org/viewcvs/traverso/src/common/sse_functions.S?cvsroot=traverso&rev=1.1
http://cvs.savannah.gnu.org/viewcvs/traverso/src/core/CMakeLists.txt?cvsroot=traverso&r1=1.2&r2=1.3
http://cvs.savannah.gnu.org/viewcvs/traverso/src/engine/CMakeLists.txt?cvsroot=traverso&r1=1.3&r2=1.4

Patches:
Index: CMakeLists.txt
===================================================================
RCS file: /sources/traverso/traverso/CMakeLists.txt,v
retrieving revision 1.7
retrieving revision 1.8
diff -u -b -r1.7 -r1.8
--- CMakeLists.txt      7 Nov 2007 13:08:17 -0000       1.7
+++ CMakeLists.txt      7 Nov 2007 20:28:20 -0000       1.8
@@ -23,13 +23,13 @@
 OPTION(WANT_ALSA       "Include ALSA (Advanced Linux Sound Architecture) support" ON)
 OPTION(WANT_PORTAUDIO  "Include PortAudio support (Driver support for Mac OS X and Windows)" ON)
 OPTION(WANT_LV2                "Include LV2 Plugin support" ON)
-OPTION(USE_SYSTEM_SLV2_LIBRARY "Use system installed slv2 library. If not found, the internal one is compiled and used, which needs the rdf development headers" OFF)
+OPTION(USE_SYSTEM_SLV2_LIBRARY "Use system installed slv2 library. If not found, the internal one is compiled and used, which needs the rdf development headers" ON)
 OPTION(WANT_MP3_DECODE "Include mp3 decoding support, for playing mp3 files" ON)
 OPTION(WANT_MP3_ENCODE "Include mp3 encoding support, for creating mp3 files" ON)
-OPTION(WANT_PCH        "Use precompiled headers" ON)
-OPTION(WANT_DEBUG      "Debug build" ON)
+OPTION(WANT_PCH        "Use precompiled headers" OFF)
+OPTION(WANT_DEBUG      "Debug build" OFF)
 OPTION(WANT_OPENGL     "Build Traverso with OpenGL support" ON)
-OPTION(WANT_TRAVERSO_DEBUG "Provides 4 levels of debug output on the command line" ON)
+OPTION(WANT_TRAVERSO_DEBUG "Provides 4 levels of debug output on the command line, always on for DEBUG builds" OFF)
 
 
 SET(MAIN_DIR_NAME "src")
@@ -289,22 +289,6 @@
 ENDIF(FFTW3_FOUND)
 
 
-# Check GCC for PCH support
-SET(USE_PCH FALSE)
-IF(WANT_PCH)
-    FIND_PACKAGE(PCHSupport)
-    IF(PCHSupport_FOUND)
-        SET(USE_PCH TRUE)
-        SET(CMAKE_CXX_FLAGS_DEBUG  "${CMAKE_CXX_FLAGS_DEBUG} -include ${CMAKE_SOURCE_DIR}/src/precompile.h -Winvalid-pch")
-        SET(CMAKE_CXX_FLAGS_RELEASE  "${CMAKE_CXX_FLAGS_RELEASE} -include ${CMAKE_SOURCE_DIR}/src/precompile.h -Winvalid-pch")
-        MESSAGE(STATUS "Enabling precompiled headers for GCC ${gcc_compiler_version}")
-    ENDIF(PCHSupport_FOUND)
-ELSE(WANT_PCH)
-#QT_DEFINITIONS are added by the pch, so when there is no pch support add them now!
-       ADD_DEFINITIONS(${QT_DEFINITIONS})
-ENDIF(WANT_PCH)
-
-
 CHECK_INCLUDE_FILE("sys/vfs.h" HAVE_SYS_VFS_H)
 IF(HAVE_SYS_VFS_H)
        ADD_DEFINITIONS(-DHAVE_SYS_VFS_H)
@@ -357,6 +341,87 @@
 ENDIF(APPLE)
 
 
+CHECK_FUNCTION_EXISTS(posix_memalign HAVE_POSIX_MEMALIGN)
+IF(NOT HAVE_POSIX_MEMALIGN)
+       ADD_DEFINITIONS(-DNO_POSIX_MEMALIGN)
+       MESSAGE("Did not find posix_memalign(), using malloc")
+ENDIF(NOT HAVE_POSIX_MEMALIGN)
+
+
+execute_process(
+       COMMAND uname -m
+       OUTPUT_VARIABLE uname_invoke_result
+       RESULT_VARIABLE uname_failed
+)
+
+execute_process(
+       COMMAND cat /proc/cpuinfo
+       COMMAND grep ^flags
+       OUTPUT_VARIABLE cat_invoke_result
+       RESULT_VARIABLE cat_failed
+)
+
+IF(UNIX)
+       IF(NOT WANT_DEBUG)
+               SET(HOST_SUPPORTS_SSE FALSE)
+               
+               IF(cat_invoke_result MATCHES sse)
+                       SET(HOST_SUPPORTS_SSE TRUE)
+                       ADD_DEFINITIONS(-DSSE_OPTIMIZATIONS)
+               ENDIF(cat_invoke_result MATCHES sse)
+               
+               IF(cat_invoke_result MATCHES mmx)
+                       SET(CMAKE_CXX_FLAGS_RELEASE  "${CMAKE_CXX_FLAGS_RELEASE} -mmmx")
+               ENDIF(cat_invoke_result MATCHES mmx)
+                               
+               IF(cat_invoke_result MATCHES 3dnow)
+                       SET(CMAKE_CXX_FLAGS_RELEASE  "${CMAKE_CXX_FLAGS_RELEASE} -m3dnow")
+               ENDIF(cat_invoke_result MATCHES 3dnow)
+                               
+               IF(uname_invoke_result MATCHES i586)
+                       SET(CMAKE_CXX_FLAGS_RELEASE  "${CMAKE_CXX_FLAGS_RELEASE} -march=i586")
+               ENDIF(uname_invoke_result MATCHES i586)
+                               
+               IF(uname_invoke_result MATCHES i686)
+                       SET(CMAKE_CXX_FLAGS_RELEASE  "${CMAKE_CXX_FLAGS_RELEASE} -march=i686")
+                       IF(HOST_SUPPORTS_SSE)
+                               SET(CMAKE_CXX_FLAGS_RELEASE  "${CMAKE_CXX_FLAGS_RELEASE} -msse -mfpmath=sse")
+                               ADD_DEFINITIONS(-DUSE_XMMINTRIN)
+                       ENDIF(HOST_SUPPORTS_SSE)
+               ENDIF(uname_invoke_result MATCHES i686)
+                               
+               IF(uname_invoke_result MATCHES x86_64)
+                       IF(HOST_SUPPORTS_SSE)
+                               SET(CMAKE_CXX_FLAGS_RELEASE  "${CMAKE_CXX_FLAGS_RELEASE} -msse -mfpmath=sse")
+                               ADD_DEFINITIONS(-DUSE_XMMINTRIN -DUSE_X86_64_ASM)
+                               SET(IS_ARCH_X86_64 TRUE)
+                       ENDIF(HOST_SUPPORTS_SSE)
+               ENDIF(uname_invoke_result MATCHES x86_64)
+               
+               IF(uname_invoke_result MATCHES i[456]86)
+                       ADD_DEFINITIONS(-DARCH_X86)
+                       SET(IS_ARCH_X86 TRUE)
+               ENDIF(uname_invoke_result MATCHES i[456]86)
+       ENDIF(NOT WANT_DEBUG)
+ENDIF(UNIX)
+
+# Check GCC for PCH support
+SET(USE_PCH FALSE)
+IF(WANT_PCH)
+    FIND_PACKAGE(PCHSupport)
+    IF(PCHSupport_FOUND)
+        SET(USE_PCH TRUE)
+        SET(CMAKE_CXX_FLAGS_DEBUG  "${CMAKE_CXX_FLAGS_DEBUG} -include ${CMAKE_SOURCE_DIR}/src/precompile.h -Winvalid-pch")
+        SET(CMAKE_CXX_FLAGS_RELEASE  "${CMAKE_CXX_FLAGS_RELEASE} -include ${CMAKE_SOURCE_DIR}/src/precompile.h -Winvalid-pch")
+        MESSAGE(STATUS "Enabling precompiled headers for GCC ${gcc_compiler_version}")
+    ENDIF(PCHSupport_FOUND)
+ELSE(WANT_PCH)
+#QT_DEFINITIONS are added by the pch, so when there is no pch support add them now!
+       ADD_DEFINITIONS(${QT_DEFINITIONS})
+ENDIF(WANT_PCH)
+
+
+
 #Add our source subdirs
 ADD_SUBDIRECTORY(src)
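
For context: the posix_memalign check added above matters because the movaps fast paths in the new assembly need 16-byte aligned buffers. A minimal sketch of how the NO_POSIX_MEMALIGN define is typically consumed (aligned_alloc_16 is an illustrative helper, not Traverso's actual allocation code):

#include <stdlib.h>

/* Allocate nfloats floats, 16-byte aligned where the platform allows.
   With plain malloc the SSE routines still work; their alignment
   pre-checks just route more frames through the scalar loops. */
static float *aligned_alloc_16(size_t nfloats)
{
#ifdef NO_POSIX_MEMALIGN
        return (float *) malloc(nfloats * sizeof(float));
#else
        void *buf = NULL;

        if (posix_memalign(&buf, 16, nfloats * sizeof(float)) != 0)
                return NULL;
        return (float *) buf;
#endif
}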
 

Index: src/core/CMakeLists.txt
===================================================================
RCS file: /sources/traverso/traverso/src/core/CMakeLists.txt,v
retrieving revision 1.2
retrieving revision 1.3
diff -u -b -r1.2 -r1.3
--- src/core/CMakeLists.txt     5 Nov 2007 22:28:11 -0000       1.2
+++ src/core/CMakeLists.txt     7 Nov 2007 20:28:20 -0000       1.3
@@ -85,9 +85,27 @@
 
 QT4_WRAP_CPP(TRAVERSO_CORE_MOC_SOURCES ${TRAVERSO_CORE_MOC_CLASSES})
 
+IF(UNIX)
+       IF(HOST_SUPPORTS_SSE)
+               IF(IS_ARCH_X86)
+                       SET(ASM_FILE ${CMAKE_SOURCE_DIR}/src/common/sse_functions.S)
+               ENDIF(IS_ARCH_X86)
+               IF(IS_ARCH_X86_64)
+                       SET(ASM_FILE ${CMAKE_SOURCE_DIR}/src/common/sse_functions_64bit.S)
+               ENDIF(IS_ARCH_X86_64)
+               
+               add_custom_command(OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/sse_functions.o
+                       COMMAND gcc -c -pipe -O2 -fPIC ${ASM_FILE} -o ${CMAKE_CURRENT_BINARY_DIR}/sse_functions.o
+                       DEPENDS ${ASM_FILE}
+               )
+               SET(SSE_OBJECT_FILE ${CMAKE_CURRENT_BINARY_DIR}/sse_functions.o)
+       ENDIF(HOST_SUPPORTS_SSE)
+ENDIF(UNIX)
+
+
 SET(TRAVERSO_CORE_LIBRARY "traversocore")
 
-ADD_LIBRARY(${TRAVERSO_CORE_LIBRARY} STATIC ${TRAVERSO_CORE_SOURCES} ${TRAVERSO_CORE_MOC_SOURCES})
+ADD_LIBRARY(${TRAVERSO_CORE_LIBRARY} STATIC ${TRAVERSO_CORE_SOURCES} ${TRAVERSO_CORE_MOC_SOURCES} ${SSE_OBJECT_FILE})
 
 IF(USE_PCH)
     ADD_DEPENDENCIES(traversocore precompiled_headers)
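
For context: the object file assembled above is linked straight into the traversocore static library, and SSE_OPTIMIZATIONS is only defined when the top-level script detected SSE. A sketch of how a call site could be guarded (mix_buffers_with_gain is a hypothetical wrapper, not Traverso's actual API; the nframes type follows the 64-bit prototype comment):

#ifdef SSE_OPTIMIZATIONS
/* implemented in src/common/sse_functions*.S */
extern void x86_sse_mix_buffers_with_gain(float *dst, float *src,
                                          unsigned int nframes, float gain);
#endif

static void mix_buffers_with_gain(float *dst, float *src,
                                  unsigned int nframes, float gain)
{
#ifdef SSE_OPTIMIZATIONS
        x86_sse_mix_buffers_with_gain(dst, src, nframes, gain);
#else
        unsigned int i;

        /* portable scalar fallback with the same semantics */
        for (i = 0; i < nframes; i++)
                dst[i] += src[i] * gain;
#endif
}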

Index: src/engine/CMakeLists.txt
===================================================================
RCS file: /sources/traverso/traverso/src/engine/CMakeLists.txt,v
retrieving revision 1.3
retrieving revision 1.4
diff -u -b -r1.3 -r1.4
--- src/engine/CMakeLists.txt   6 Nov 2007 18:25:16 -0000       1.3
+++ src/engine/CMakeLists.txt   7 Nov 2007 20:28:21 -0000       1.4
@@ -43,7 +43,6 @@
        )
 ENDIF(HAVE_PORTAUDIO)
 
-
 SET(TRAVERSO_ENGINE_LIBRARY "traversoaudiobackend")
 
 QT4_WRAP_CPP(TRAVERSO_ENGINE_MOC_SOURCES ${TRAVERSO_ENGINE_MOC_CLASSES})

Index: src/common/sse_functions_64bit.S
===================================================================
RCS file: src/common/sse_functions_64bit.S
diff -N src/common/sse_functions_64bit.S
--- /dev/null   1 Jan 1970 00:00:00 -0000
+++ src/common/sse_functions_64bit.S    7 Nov 2007 20:28:20 -0000       1.1
@@ -0,0 +1,609 @@
+/*
+    Copyright (C) 2005-2006 Paul Davis, John Rigg
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with this program; if not, write to the Free Software
+    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+
+       Author: Sampo Savolainen
+       64-bit conversion: John Rigg
+
+    $Id: sse_functions_64bit.S,v 1.1 2007/11/07 20:28:20 r_sijrier Exp $
+*/
+
+
+#; void x86_sse_mix_buffers_with_gain (float *dst, float *src, unsigned int nframes, float gain);
+
+.globl x86_sse_mix_buffers_with_gain
+       .type   x86_sse_mix_buffers_with_gain,@function
+
+x86_sse_mix_buffers_with_gain:
+
+#; %rdi float  *dst
+#; %rsi float  *src    
+#; %rdx unsigned int nframes
+#; %xmm0 float gain
+
+       pushq %rbp
+       movq %rsp, %rbp
+
+       #; save the registers
+       pushq %rbx
+       pushq %rdi
+       pushq %rsi
+       
+       #; if nframes == 0, go to end
+       cmp     $0, %rdx
+       je      .MBWG_END
+
+       #; Check for alignment
+
+       movq %rdi, %rax
+       andq $12, %rax #; mask alignment offset
+
+       movq %rsi, %rbx
+       andq $12, %rbx #; mask alignment offset
+
+       cmp %rax, %rbx
+       jne .MBWG_NONALIGN #; if not aligned, calculate manually
+
+       #; if we are aligned
+       cmp $0, %rbx
+       jz .MBWG_SSE
+       
+       #; Pre-loop, we need to run 1-3 frames "manually" without
+       #; SSE instructions
+
+.MBWG_PRELOOP:
+       
+       #; gain is already in %xmm0
+       movss (%rsi), %xmm1
+       mulss %xmm0, %xmm1
+       addss (%rdi), %xmm1
+       movss %xmm1, (%rdi)
+
+       addq $4, %rdi #; dst++
+       addq $4, %rsi #; src++
+       decq %rdx         #; nframes--
+       jz .MBWG_END
+
+       addq $4, %rbx
+       
+       cmp $16, %rbx #; test if we've reached 16 byte alignment
+       jne .MBWG_PRELOOP
+
+
+.MBWG_SSE:
+
+       cmp $4, %rdx #; we know it's not zero, but if it's not >=4, then
+       jnge .MBWG_NONALIGN #; we jump straight to the "normal" code
+
+       #; gain is already in %xmm0
+       shufps  $0x00, %xmm0, %xmm0
+
+
+.MBWG_SSELOOP:
+
+       movaps  (%rsi), %xmm1 #; source => xmm0
+       mulps   %xmm0,  %xmm1 #; apply gain to source
+       addps   (%rdi), %xmm1 #; mix with destination
+       movaps  %xmm1, (%rdi) #; copy result to destination
+       
+       addq $16, %rdi #; dst+=4
+       addq $16, %rsi #; src+=4
+
+       subq $4, %rdx #; nframes-=4
+       cmp $4, %rdx
+       jge .MBWG_SSELOOP
+
+       cmp $0, %rdx
+       je .MBWG_END
+
+       #; if there are remaining frames, the nonalign code will do nicely
+       #; for the rest 1-3 frames.
+       
+.MBWG_NONALIGN:
+       #; not aligned!
+
+       #; gain is already in %xmm0
+
+.MBWG_NONALIGNLOOP:
+
+       movss (%rsi), %xmm1
+       mulss %xmm0, %xmm1
+       addss (%rdi), %xmm1
+       movss %xmm1, (%rdi)
+       
+       addq $4, %rdi
+       addq $4, %rsi
+       
+       decq %rdx
+       jnz .MBWG_NONALIGNLOOP
+
+.MBWG_END:
+
+       popq %rsi
+       popq %rdi
+       popq %rbx
+       
+       #; return
+       leave
+       ret
+
+.size  x86_sse_mix_buffers_with_gain, .-x86_sse_mix_buffers_with_gain
+
+
+#; void x86_sse_mix_buffers_no_gain (float *dst, float *src, unsigned int nframes);
+
+.globl x86_sse_mix_buffers_no_gain
+       .type   x86_sse_mix_buffers_no_gain,@function
+
+x86_sse_mix_buffers_no_gain:
+
+#; %rdi float *dst
+#; %rsi float *src
+#; %rdx unsigned int nframes
+
+       pushq %rbp
+       movq %rsp, %rbp
+
+       #; save the registers
+       pushq %rbx
+       pushq %rdi
+       pushq %rsi
+       
+       #; the real function
+
+       #; if nframes == 0, go to end
+       cmp     $0, %rdx
+       je      .MBNG_END
+
+       #; Check for alignment
+
+       movq %rdi, %rax
+       andq $12, %rax #; mask alignment offset
+
+       movq %rsi, %rbx
+       andq $12, %rbx #; mask alignment offset
+
+       cmp %rax, %rbx
+       jne .MBNG_NONALIGN #; if not aligned, calculate manually
+
+       cmp $0, %rbx
+       je .MBNG_SSE
+
+       #; Pre-loop, we need to run 1-3 frames "manually" without
+       #; SSE instructions
+
+.MBNG_PRELOOP:
+               
+       movss (%rsi), %xmm0
+       addss (%rdi), %xmm0
+       movss %xmm0, (%rdi)
+
+       addq $4, %rdi #; dst++
+       addq $4, %rsi #; src++
+       decq %rdx         #; nframes--
+       jz      .MBNG_END
+       addq $4, %rbx
+       
+       cmp $16, %rbx #; test if we've reached 16 byte alignment
+       jne .MBNG_PRELOOP
+
+.MBNG_SSE:
+
+       cmp $4, %rdx #; if there are frames left, but less than 4
+       jnge .MBNG_NONALIGN #; we can't run SSE
+
+.MBNG_SSELOOP:
+
+       movaps  (%rsi), %xmm0 #; source => xmm0
+       addps   (%rdi), %xmm0 #; mix with destination
+       movaps  %xmm0, (%rdi) #; copy result to destination
+       
+       addq $16, %rdi #; dst+=4
+       addq $16, %rsi #; src+=4
+
+       subq $4, %rdx #; nframes-=4
+       cmp $4, %rdx
+       jge .MBNG_SSELOOP
+
+       cmp $0, %rdx
+       je .MBNG_END
+
+       #; if there are remaining frames, the nonalign code will do nicely
+       #; for the rest 1-3 frames.
+       
+.MBNG_NONALIGN:
+       #; not aligned!
+
+       movss (%rsi), %xmm0 #; src => xmm0
+       addss (%rdi), %xmm0 #; xmm0 += dst
+       movss %xmm0, (%rdi) #; xmm0 => dst
+       
+       addq $4, %rdi
+       addq $4, %rsi
+       
+       decq %rdx
+       jnz .MBNG_NONALIGN
+
+.MBNG_END:
+
+       popq %rsi
+       popq %rdi
+       popq %rbx
+       
+       #; return
+       leave
+       ret
+
+.size  x86_sse_mix_buffers_no_gain, .-x86_sse_mix_buffers_no_gain
+
+
+#; void x86_sse_apply_gain_to_buffer (float *buf, unsigned int nframes, float gain);
+
+.globl x86_sse_apply_gain_to_buffer
+       .type   x86_sse_apply_gain_to_buffer,@function
+
+x86_sse_apply_gain_to_buffer:
+
+#; %rdi  float        *buf
+#; %rsi  unsigned int  nframes
+#; %xmm0 float         gain
+#; %xmm1 float         buf[0]
+
+       pushq %rbp
+       movq %rsp, %rbp
+
+       #; save %rdi
+       pushq %rdi
+       
+       #; the real function
+
+       #; if nframes == 0, go to end
+       movq %rsi, %rcx #; nframes
+       cmp     $0, %rcx
+       je      .AG_END
+
+       #; set up the gain buffer (gain is already in %xmm0)
+       shufps  $0x00, %xmm0, %xmm0
+       
+       #; Check for alignment
+
+       movq %rdi, %rdx #; buf => %rdx
+       andq $12, %rdx #; mask bits 1 & 2, result = 0, 4, 8 or 12
+       jz      .AG_SSE #; if buffer IS aligned
+
+       #; PRE-LOOP
+       #; we iterate 1-3 times, doing scalar float multiplies
+       #; so we reach a 16 byte aligned "buf" (=%rdi) value
+
+.AGLP_START:
+
+       #; Load next value from the buffer into %xmm1
+       movss (%rdi), %xmm1
+       mulss %xmm0, %xmm1
+       movss %xmm1, (%rdi)
+
+       #; increment buffer, decrement counter
+       addq $4, %rdi #; buf++;
+       
+       decq %rcx   #; nframes--
+       jz      .AG_END #; if we run out of frames, we go to the end
+       
+       addq $4, %rdx #; one non-aligned byte less
+       cmp $16, %rdx
+       jne .AGLP_START #; if more non-aligned frames exist, we do a do-over
+
+.AG_SSE:
+
+       #; We have reached the 16 byte aligned "buf" ("rdi") value
+
+       #; Figure out how many loops we should do
+       movq %rcx, %rax #; copy remaining nframes to %rax for division
+       movq $0, %rdx   #; 0 the edx register
+       
+       
+       pushq %rdi
+       movq $4, %rdi
+       divq %rdi #; %rdx = remainder == 0
+       popq %rdi
+
+       #; %rax = SSE iterations
+       cmp $0, %rax
+       je .AGPOST_START
+
+       
+.AGLP_SSE:
+
+       movaps (%rdi), %xmm1
+       mulps %xmm0, %xmm1
+       movaps %xmm1, (%rdi)
+
+       addq $16, %rdi
+       subq $4, %rcx   #; nframes-=4
+
+       decq %rax
+       jnz .AGLP_SSE
+
+       #; Next we need to post-process all remaining frames
+       #; the remaining frame count is in %rcx
+       
+       #; if no remaining frames, jump to the end
+       andq $3, %rcx #; nframes % 4 (the je below tests this result)
+       je .AG_END
+
+.AGPOST_START:
+
+       movss (%rdi), %xmm1
+       mulss %xmm0, %xmm1
+       movss %xmm1, (%rdi)
+
+       #; increment buffer, decrement counter
+       addq $4, %rdi #; buf++;
+       
+       decq %rcx   #; nframes--
+       jnz     .AGPOST_START #; if we run out of frames, we go to the end
+       
+.AG_END:
+
+
+       popq %rdi
+       
+       #; return
+       leave
+       ret
+
+.size  x86_sse_apply_gain_to_buffer, .-x86_sse_apply_gain_to_buffer
+#; end proc
+
+
+#; x86_sse_apply_gain_vector(float *buf, float *gain_vector, unsigned int nframes)
+
+.globl x86_sse_apply_gain_vector
+        .type   x86_sse_apply_gain_vector,@function
+
+x86_sse_apply_gain_vector:
+
+#; %rdi float *buf
+#; %rsi float *gain_vector
+#; %rdx unsigned int nframes
+
+       pushq %rbp
+       movq %rsp, %rbp
+
+       #; Save registers
+       pushq %rdi
+       pushq %rsi
+       pushq %rbx
+
+       #; if nframes == 0 go to end
+       cmp $0, %rdx
+       je .AGA_END
+               
+       #; Check alignment
+       movq %rdi, %rax
+       andq $12, %rax
+               
+       movq %rsi, %rbx
+       andq $12, %rbx
+
+       cmp %rax,%rbx
+       jne .AGA_ENDLOOP
+
+       cmp $0, %rax
+       jz .AGA_SSE #; if buffers are aligned, jump to the SSE loop
+
+#; Buffers aren't 16 byte aligned, but they are unaligned by the same amount
+.AGA_ALIGNLOOP:
+               
+       movss (%rdi), %xmm0 #; buf => xmm0
+       movss (%rsi), %xmm1 #; gain value => xmm1
+       mulss %xmm1, %xmm0  #; xmm1 * xmm0 => xmm0
+       movss %xmm0, (%rdi) #; signal with gain => buf
+
+       decq %rdx
+       jz .AGA_END
+
+       addq $4, %rdi #; buf++
+       addq $4, %rsi #; gain_vector++
+       
+       addq $4, %rax
+       cmp $16, %rax
+       jne .AGA_ALIGNLOOP
+       
+#; There are frames left for sure, as that is checked in the beginning
+#; and within the previous loop. BUT, there might be less than 4 frames
+#; to process
+
+.AGA_SSE:
+       movq %rdx, %rax #; nframes => %rax
+       shr $2, %rax #; unsigned divide by 4
+
+       cmp $0, %rax  #; if it works without this, that's nice
+       je .AGA_ENDLOOP
+
+.AGA_SSELOOP:
+       movaps (%rdi), %xmm0
+       movaps (%rsi), %xmm1
+       mulps %xmm1, %xmm0
+       movaps %xmm0, (%rdi)
+
+       addq $16, %rdi
+       addq $16, %rsi
+
+       decq %rax
+       jnz .AGA_SSELOOP
+
+       andq $3, %rdx #; Remaining frames are nframes & 3
+       jz .AGA_END
+
+
+#; Inside this loop, we know there are frames left to process
+#; but because either there are < 4 frames left, or the buffers
+#; are not aligned, we can't use the parallel SSE ops
+.AGA_ENDLOOP:
+       movss (%rdi), %xmm0 #; buf => xmm0
+       movss (%rsi), %xmm1 #; gain value => xmm1
+       mulss %xmm1, %xmm0  #; xmm1 * xmm0 => xmm0
+       movss %xmm0, (%rdi) #; signal with gain => buf
+
+       addq $4,%rdi
+       addq $4,%rsi
+       decq %rdx #; nframes--
+       jnz .AGA_ENDLOOP
+
+.AGA_END:
+
+       popq %rbx
+       popq %rsi
+       popq %rdi
+
+       leave
+       ret
+
+.size  x86_sse_apply_gain_vector, .-x86_sse_apply_gain_vector
+#; end proc
+
+
+#; float x86_sse_compute_peak(float *buf, long nframes, float current);
+
+.globl x86_sse_compute_peak
+       .type   x86_sse_compute_peak,@function
+
+       
+x86_sse_compute_peak:
+
+#; %rdi  float        *buf
+#; %rsi  unsigned int  nframes
+#; %xmm0 float         current
+#; %xmm1 float         buf[0]
+
+       pushq %rbp
+       movq %rsp, %rbp
+
+       #; save %rdi
+       pushq %rdi
+       
+       #; if nframes == 0, go to end
+       movq %rsi, %rcx #; nframes
+       cmp     $0, %rcx
+       je      .CP_END
+
+       #; create the "abs" mask in %xmm2
+       pushq   $2147483647
+       movss   (%rsp), %xmm2
+       addq    $8, %rsp
+       shufps  $0x00, %xmm2, %xmm2
+
+       #; Check for alignment
+
+       #;movq 8(%rbp), %rdi #; buf 
+       movq %rdi, %rdx #; buf => %rdx
+       andq $12, %rdx #; mask bits 1 & 2, result = 0, 4, 8 or 12
+       jz      .CP_SSE #; if buffer IS aligned
+
+       #; PRE-LOOP
+       #; we iterate 1-3 times, doing normal x87 float comparison
+       #; so we reach a 16 byte aligned "buf" (=%rdi) value
+
+.LP_START:
+
+       #; Load next value from the buffer
+       movss (%rdi), %xmm1
+       andps %xmm2, %xmm1
+       maxss %xmm1, %xmm0
+
+       #; increment buffer, decrement counter
+       addq $4, %rdi #; buf++;
+       
+       decq %rcx   #; nframes--
+       jz      .CP_END #; if we run out of frames, we go to the end
+       
+       addq $4, %rdx #; one non-aligned byte less
+       cmp $16, %rdx
+       jne .LP_START #; if more non-aligned frames exist, we do a do-over
+
+.CP_SSE:
+
+       #; We have reached the 16 byte aligned "buf" ("rdi") value
+
+       #; Figure out how many loops we should do
+       movq %rcx, %rax #; copy remaining nframes to %rax for division
+
+       shr $2,%rax #; unsigned divide by 4
+       jz .POST_START
+
+       #; %rax = SSE iterations
+
+       #; current maximum is at %xmm0, but we need to ..
+       shufps $0x00, %xmm0, %xmm0 #; shuffle "current" to all 4 FP's
+
+       #;prefetcht0 16(%rdi)
+
+.LP_SSE:
+
+       movaps (%rdi), %xmm1
+       andps %xmm2, %xmm1
+       maxps %xmm1, %xmm0
+
+       addq $16, %rdi
+
+       decq %rax
+       jnz .LP_SSE
+
+       #; Calculate the maximum value contained in the 4 FP's in %xmm0
+       movaps %xmm0, %xmm1
+       shufps $0x4e, %xmm1, %xmm1 #; shuffle left & right pairs (1234 => 3412)
+       maxps  %xmm1, %xmm0 #; maximums of the two pairs
+       movaps %xmm0, %xmm1
+       shufps $0xb1, %xmm1, %xmm1 #; shuffle the floats inside the two pairs (1234 => 2143)
+       maxps  %xmm1, %xmm0 
+
+       #; now every float in %xmm0 is the same value, current maximum value
+       
+       #; Next we need to post-process all remaining frames
+       #; the remaining frame count is in %rcx
+       
+       #; if no remaining frames, jump to the end
+
+       andq $3, %rcx #; nframes % 4
+       jz .CP_END
+
+.POST_START:
+
+       movss (%rdi), %xmm1
+       andps %xmm2, %xmm1
+       maxss %xmm1, %xmm0
+       
+       addq $4, %rdi   #; buf++;
+       
+       decq %rcx               #; nframes--;
+       jnz .POST_START
+
+.CP_END:
+
+       popq %rdi
+       
+       #; return
+       leave
+       ret
+
+.size  x86_sse_compute_peak, .-x86_sse_compute_peak
+#; end proc
+
+#ifdef __ELF__
+.section .note.GNU-stack,"",%progbits
+#endif
+
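
Aside: the shufps $0x4e / $0xb1 pair near the end of x86_sse_compute_peak above is a horizontal maximum across the four packed floats. The same reduction written with <xmmintrin.h> intrinsics (a sketch for illustration, not code from the tree; horizontal_max is a hypothetical name):

#include <xmmintrin.h>

static float horizontal_max(__m128 v)
{
        /* swap the two 64-bit halves: (1 2 3 4) => (3 4 1 2), then max */
        v = _mm_max_ps(v, _mm_shuffle_ps(v, v, 0x4e));
        /* swap floats within each pair: (1 2 3 4) => (2 1 4 3), then max */
        v = _mm_max_ps(v, _mm_shuffle_ps(v, v, 0xb1));
        /* every lane now holds the overall maximum; return lane 0 */
        return _mm_cvtss_f32(v);
}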

Index: src/common/sse_functions.S
===================================================================
RCS file: src/common/sse_functions.S
diff -N src/common/sse_functions.S
--- /dev/null   1 Jan 1970 00:00:00 -0000
+++ src/common/sse_functions.S  7 Nov 2007 20:28:20 -0000       1.1
@@ -0,0 +1,529 @@
+/*
+    Copyright (C) 2005 Sampo Savolainen
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with this program; if not, write to the Free Software
+    Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA.
+
+    $Id: sse_functions.S,v 1.1 2007/11/07 20:28:20 r_sijrier Exp $
+*/
+
+
+#; void x86_sse_mix_buffers_with_gain (float *dst, float *src, long nframes, float gain);
+
+.globl x86_sse_mix_buffers_with_gain
+       .type   x86_sse_mix_buffers_with_gain,@function
+
+x86_sse_mix_buffers_with_gain:
+#; 8(%ebp)     = float *dst    = %edi
+#; 12(%ebp) = float *src       = %esi
+#; 16(%ebp) = long     nframes = %ecx
+#; 20(%ebp) = float    gain    = st(0)
+
+       pushl %ebp
+       movl %esp, %ebp
+
+       #; save the registers
+#;     pushl %eax
+       pushl %ebx
+#;     pushl %ecx
+       pushl %edi
+       pushl %esi
+       
+       #; if nframes == 0, go to end
+       movl 16(%ebp), %ecx #; nframes
+       cmp     $0, %ecx
+       je      .MBWG_END
+
+       #; Check for alignment
+
+       movl 8(%ebp), %edi  #; dst 
+       movl 12(%ebp), %esi #; src
+
+       movl %edi, %eax
+       andl $12, %eax #; mask alignment offset
+
+       movl %esi, %ebx
+       andl $12, %ebx #; mask alignment offset
+
+       cmp %eax, %ebx
+       jne .MBWG_NONALIGN #; if not aligned, calculate manually
+
+       #; if we are aligned
+       cmp $0, %ebx
+       jz .MBWG_SSE
+       
+       #; Pre-loop, we need to run 1-3 frames "manually" without
+       #; SSE instructions
+
+       movss 20(%ebp), %xmm1 #; xmm1
+
+.MBWG_PRELOOP:
+       
+       movss (%esi), %xmm0
+       mulss %xmm1, %xmm0
+       addss (%edi), %xmm0
+       movss %xmm0, (%edi)
+
+       addl $4, %edi #; dst++
+       addl $4, %esi #; src++
+       decl %ecx         #; nframes--
+       jz .MBWG_END
+
+#;     cmp $0, %ecx
+#;     je .MBWG_END #; if we run out of frames, go to end
+       
+       addl $4, %ebx
+       
+       cmp $16, %ebx #; test if we've reached 16 byte alignment
+       jne .MBWG_PRELOOP
+
+
+.MBWG_SSE:
+
+       cmp $4, %ecx #; we know it's not zero, but if it's not >=4, then
+       jnge .MBWG_NONALIGN #; we jump straight to the "normal" code
+
+       #; copy gain to fill %xmm1
+       movss   20(%ebp), %xmm1
+       shufps  $0x00, %xmm1, %xmm1
+
+
+.MBWG_SSELOOP:
+
+       movaps  (%esi), %xmm0 #; source => xmm0
+       mulps   %xmm1,  %xmm0 #; apply gain to source
+       addps   (%edi), %xmm0 #; mix with destination
+       movaps  %xmm0, (%edi) #; copy result to destination
+       
+       addl $16, %edi #; dst+=4
+       addl $16, %esi #; src+=4
+
+       subl $4, %ecx #; nframes-=4
+       cmp $4, %ecx
+       jge .MBWG_SSELOOP
+
+       cmp $0, %ecx
+       je .MBWG_END
+
+       #; if there are remaining frames, the nonalign code will do nicely
+       #; for the rest 1-3 frames.
+       
+.MBWG_NONALIGN:
+       #; not aligned!
+
+       movss 20(%ebp), %xmm1 #; gain => xmm1
+
+.MBWG_NONALIGNLOOP:
+
+       movss (%esi), %xmm0
+       mulss %xmm1, %xmm0
+       addss (%edi), %xmm0
+       movss %xmm0, (%edi)
+       
+       addl $4, %edi
+       addl $4, %esi
+       
+       decl %ecx
+       jnz .MBWG_NONALIGNLOOP
+
+.MBWG_END:
+
+       popl %esi
+       popl %edi
+#;     popl %ecx
+       popl %ebx
+#;     popl %eax
+       
+       #; return
+       leave
+       ret
+
+.size  x86_sse_mix_buffers_with_gain, .-x86_sse_mix_buffers_with_gain
+
+
+
+
+#; void x86_sse_mix_buffers_no_gain (float *dst, float *src, long nframes);
+
+.globl x86_sse_mix_buffers_no_gain
+       .type   x86_sse_mix_buffers_no_gain,@function
+
+x86_sse_mix_buffers_no_gain:
+#; 8(%ebp)     = float *dst    = %edi
+#; 12(%ebp) = float *src       = %esi
+#; 16(%ebp) = long     nframes = %ecx
+
+       pushl %ebp
+       movl %esp, %ebp
+
+       #; save the registers
+#;     pushl %eax
+       pushl %ebx
+#;     pushl %ecx
+       pushl %edi
+       pushl %esi
+       
+       #; the real function
+
+       #; if nframes == 0, go to end
+       movl 16(%ebp), %ecx #; nframes
+       cmp     $0, %ecx
+       je      .MBNG_END
+
+       #; Check for alignment
+
+       movl 8(%ebp), %edi  #; dst 
+       movl 12(%ebp), %esi #; src
+
+       movl %edi, %eax
+       andl $12, %eax #; mask alignment offset
+
+       movl %esi, %ebx
+       andl $12, %ebx #; mask alignment offset
+
+       cmp %eax, %ebx
+       jne .MBNG_NONALIGN #; if not aligned, calculate manually
+
+       cmp $0, %ebx
+       je .MBNG_SSE
+
+       #; Pre-loop, we need to run 1-3 frames "manually" without
+       #; SSE instructions
+
+.MBNG_PRELOOP:
+               
+       movss (%esi), %xmm0
+       addss (%edi), %xmm0
+       movss %xmm0, (%edi)
+
+       addl $4, %edi #; dst++
+       addl $4, %esi #; src++
+       decl %ecx         #; nframes--
+       jz      .MBNG_END
+       addl $4, %ebx
+       
+       cmp $16, %ebx #; test if we've reached 16 byte alignment
+       jne .MBNG_PRELOOP
+
+.MBNG_SSE:
+
+       cmp $4, %ecx #; if there are frames left, but less than 4
+       jnge .MBNG_NONALIGN #; we can't run SSE
+
+.MBNG_SSELOOP:
+
+       movaps  (%esi), %xmm0 #; source => xmm0
+       addps   (%edi), %xmm0 #; mix with destination
+       movaps  %xmm0, (%edi) #; copy result to destination
+       
+       addl $16, %edi #; dst+=4
+       addl $16, %esi #; src+=4
+
+       subl $4, %ecx #; nframes-=4
+       cmp $4, %ecx
+       jge .MBNG_SSELOOP
+
+       cmp $0, %ecx
+       je .MBNG_END
+
+       #; if there are remaining frames, the nonalign code will do nicely
+       #; for the rest 1-3 frames.
+       
+.MBNG_NONALIGN:
+       #; not aligned!
+
+       movss (%esi), %xmm0 #; src => xmm0
+       addss (%edi), %xmm0 #; xmm0 += dst
+       movss %xmm0, (%edi) #; xmm0 => dst
+       
+       addl $4, %edi
+       addl $4, %esi
+       
+       decl %ecx
+       jnz .MBNG_NONALIGN
+
+.MBNG_END:
+
+       popl %esi
+       popl %edi
+#;     popl %ecx
+       popl %ebx
+#;     popl %eax
+       
+       #; return
+       leave
+       ret
+
+.size  x86_sse_mix_buffers_no_gain, .-x86_sse_mix_buffers_no_gain
+
+
+
+
+#; void x86_sse_apply_gain_to_buffer (float *buf, long nframes, float gain);
+
+.globl x86_sse_apply_gain_to_buffer
+       .type   x86_sse_apply_gain_to_buffer,@function
+
+x86_sse_apply_gain_to_buffer:
+#; 8(%ebp)     = float *buf    = %edi
+#; 12(%ebp) = long     nframes = %ecx
+#; 16(%ebp) = float    gain    = st(0)
+
+       pushl %ebp
+       movl %esp, %ebp
+
+       #; save %edi
+       pushl %edi
+       
+       #; the real function
+
+       #; if nframes == 0, go to end
+       movl 12(%ebp), %ecx #; nframes
+       cmp     $0, %ecx
+       je      .AG_END
+
+       #; create the gain buffer in %xmm1
+       movss   16(%ebp), %xmm1
+       shufps  $0x00, %xmm1, %xmm1
+       
+       #; Check for alignment
+
+       movl 8(%ebp), %edi #; buf 
+       movl %edi, %edx #; buf => %edx
+       andl $12, %edx #; mask bits 1 & 2, result = 0, 4, 8 or 12
+       jz      .AG_SSE #; if buffer IS aligned
+
+       #; PRE-LOOP
+       #; we iterate 1-3 times, doing scalar float multiplies
+       #; so we reach a 16 byte aligned "buf" (=%edi) value
+
+.AGLP_START:
+
+       #; Load next value from the buffer
+       movss (%edi), %xmm0
+       mulss %xmm1, %xmm0
+       movss %xmm0, (%edi)
+
+       #; increment buffer, decrement counter
+       addl $4, %edi #; buf++;
+       
+       decl %ecx   #; nframes--
+       jz      .AG_END #; if we run out of frames, we go to the end
+       
+       addl $4, %edx #; one non-aligned byte less
+       cmp $16, %edx
+       jne .AGLP_START #; if more non-aligned frames exist, we do a do-over
+
+.AG_SSE:
+
+       #; We have reached the 16 byte aligned "buf" ("edi") value
+
+       #; Figure out how many loops we should do
+       movl %ecx, %eax #; copy remaining nframes to %eax for division
+       movl $0, %edx   #; 0 the edx register
+       
+       
+       pushl %edi
+       movl $4, %edi
+       divl %edi #; %edx = remainder == 0
+       popl %edi
+
+       #; %eax = SSE iterations
+       cmp $0, %eax
+       je .AGPOST_START
+
+       
+.AGLP_SSE:
+
+       movaps (%edi), %xmm0
+       mulps %xmm1, %xmm0
+       movaps %xmm0, (%edi)
+
+       addl $16, %edi
+#;     subl $4, %ecx   #; nframes-=4
+
+       decl %eax
+       jnz .AGLP_SSE
+
+       #; Next we need to post-process all remaining frames
+       #; the remaining frame count is in %ecx
+       
+       #; if no remaining frames, jump to the end
+#;     cmp $0, %ecx
+       andl $3, %ecx #; nframes % 4
+       je .AG_END
+
+.AGPOST_START:
+
+       movss (%edi), %xmm0
+       mulss %xmm1, %xmm0
+       movss %xmm0, (%edi)
+
+       #; increment buffer, decrement counter
+       addl $4, %edi #; buf++;
+       
+       decl %ecx   #; nframes--
+       jnz     .AGPOST_START #; if we run out of frames, we go to the end
+       
+.AG_END:
+
+
+       popl %edi
+       
+       #; return
+       leave
+       ret
+
+.size  x86_sse_apply_gain_to_buffer, .-x86_sse_apply_gain_to_buffer
+#; end proc
+
+
+
+#; float x86_sse_compute_peak(float *buf, long nframes, float current);
+
+.globl x86_sse_compute_peak
+       .type   x86_sse_compute_peak,@function
+
+abs_mask:
+       .long   2147483647
+
+       
+x86_sse_compute_peak:
+#; 8(%ebp)     = float *buf    = %edi
+#; 12(%ebp) = long     nframes = %ecx
+#; 16(%ebp) = float    current = st(0)
+
+       pushl %ebp
+       movl %esp, %ebp
+
+       #; save %edi
+       pushl %edi
+       
+       #; the real function
+
+       #; Load "current" in xmm0
+       movss 16(%ebp), %xmm0
+
+       #; if nframes == 0, go to end
+       movl 12(%ebp), %ecx #; nframes
+       cmp     $0, %ecx
+       je      .CP_END
+
+       #; create the "abs" mask in %xmm2
+       movss   abs_mask, %xmm2
+       shufps  $0x00, %xmm2, %xmm2
+
+       #; Check for alignment
+
+       movl 8(%ebp), %edi #; buf 
+       movl %edi, %edx #; buf => %edx
+       andl $12, %edx #; mask bits 1 & 2, result = 0, 4, 8 or 12
+       jz      .CP_SSE #; if buffer IS aligned
+
+       #; PRE-LOOP
+       #; we iterate 1-3 times, doing normal x87 float comparison
+       #; so we reach a 16 byte aligned "buf" (=%edi) value
+
+.LP_START:
+
+       #; Load next value from the buffer
+       movss (%edi), %xmm1
+       andps %xmm2, %xmm1
+       maxss %xmm1, %xmm0
+
+       #; increment buffer, decrement counter
+       addl $4, %edi #; buf++;
+       
+       decl %ecx   #; nframes--
+       jz      .CP_END #; if we run out of frames, we go to the end
+       
+       addl $4, %edx #; one non-aligned byte less
+       cmp $16, %edx
+       jne .LP_START #; if more non-aligned frames exist, we do a do-over
+
+.CP_SSE:
+
+       #; We have reached the 16 byte aligned "buf" ("edi") value
+
+       #; Figure out how many loops we should do
+       movl %ecx, %eax #; copy remaining nframes to %eax for division
+
+       shr $2,%eax #; unsigned divide by 4
+       jz .POST_START
+
+       #; %eax = SSE iterations
+
+       #; current maximum is at %xmm0, but we need to ..
+       shufps $0x00, %xmm0, %xmm0 #; shuffle "current" to all 4 FP's
+
+       #;prefetcht0 16(%edi)
+
+.LP_SSE:
+
+       movaps (%edi), %xmm1
+       andps %xmm2, %xmm1
+       maxps %xmm1, %xmm0
+
+       addl $16, %edi
+
+       decl %eax
+       jnz .LP_SSE
+
+       #; Calculate the maximum value contained in the 4 FP's in %xmm0
+       movaps %xmm0, %xmm1
+       shufps $0x4e, %xmm1, %xmm1 #; shuffle left & right pairs (1234 => 3412)
+       maxps  %xmm1, %xmm0 #; maximums of the two pairs
+       movaps %xmm0, %xmm1
+       shufps $0xb1, %xmm1, %xmm1 #; shuffle the floats inside the two pairs (1234 => 2143)
+       maxps  %xmm1, %xmm0 
+
+       #; now every float in %xmm0 is the same value, current maximum value
+       
+       #; Next we need to post-process all remaining frames
+       #; the remaining frame count is in %ecx
+       
+       #; if no remaining frames, jump to the end
+
+       andl $3, %ecx #; nframes % 4
+       jz .CP_END
+
+.POST_START:
+
+       movss (%edi), %xmm1
+       andps %xmm2, %xmm1
+       maxss %xmm1, %xmm0
+       
+       addl $4, %edi   #; buf++;
+       
+       decl %ecx               #; nframes--;
+       jnz .POST_START
+
+.CP_END:
+
+       #; Load the value from xmm0 to the float stack for returning
+       movss %xmm0, 16(%ebp)
+       flds 16(%ebp)
+
+       popl %edi
+       
+       #; return
+       leave
+       ret
+
+.size  x86_sse_compute_peak, .-x86_sse_compute_peak
+#; end proc
+
+#ifdef __ELF__
+.section .note.GNU-stack,"",%progbits
+#endif
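
Aside: the 32-bit file above mirrors the 64-bit one, differing mainly in the calling convention (stack arguments vs. registers) and in building the abs mask from the abs_mask data label instead of a stack push. As a plain-C statement of what x86_sse_compute_peak computes (a reference sketch, assuming IEEE 754 floats; fabsf plays the role of the 0x7fffffff mask):

#include <math.h>

/* Peak of absolute sample values, seeded with the running peak "current".
   Reference sketch only; the assembly versions vectorize this 4-wide. */
static float compute_peak_ref(const float *buf, long nframes, float current)
{
        long i;

        for (i = 0; i < nframes; i++) {
                float a = fabsf(buf[i]); /* andps with 0x7fffffff clears the sign bit */
                if (a > current)
                        current = a;
        }
        return current;
}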



