[Top][All Lists]
[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
[Traverso-commit] traverso CMakeLists.txt src/core/CMakeLists.txt...
From: |
Remon Sijrier |
Subject: |
[Traverso-commit] traverso CMakeLists.txt src/core/CMakeLists.txt... |
Date: |
Wed, 07 Nov 2007 20:28:21 +0000 |
CVSROOT: /sources/traverso
Module name: traverso
Changes by: Remon Sijrier <r_sijrier> 07/11/07 20:28:21
Modified files:
. : CMakeLists.txt
src/core : CMakeLists.txt
src/engine : CMakeLists.txt
Added files:
src/common : sse_functions_64bit.S sse_functions.S
Log message:
* Ported the optimization settings from the qmake build system to the CMake
build system. Somehow the runtime SSE detection doesn't work :(
CVSWeb URLs:
http://cvs.savannah.gnu.org/viewcvs/traverso/CMakeLists.txt?cvsroot=traverso&r1=1.7&r2=1.8
http://cvs.savannah.gnu.org/viewcvs/traverso/src/common/sse_functions_64bit.S?cvsroot=traverso&rev=1.1
http://cvs.savannah.gnu.org/viewcvs/traverso/src/common/sse_functions.S?cvsroot=traverso&rev=1.1
http://cvs.savannah.gnu.org/viewcvs/traverso/src/core/CMakeLists.txt?cvsroot=traverso&r1=1.2&r2=1.3
http://cvs.savannah.gnu.org/viewcvs/traverso/src/engine/CMakeLists.txt?cvsroot=traverso&r1=1.3&r2=1.4
Patches:
Index: CMakeLists.txt
===================================================================
RCS file: /sources/traverso/traverso/CMakeLists.txt,v
retrieving revision 1.7
retrieving revision 1.8
diff -u -b -r1.7 -r1.8
--- CMakeLists.txt 7 Nov 2007 13:08:17 -0000 1.7
+++ CMakeLists.txt 7 Nov 2007 20:28:20 -0000 1.8
@@ -23,13 +23,13 @@
OPTION(WANT_ALSA "Include ALSA (Advanced Linux Sound Architecture)
support" ON)
OPTION(WANT_PORTAUDIO "Include PortAudio support (Driver support for Mac OS X
and Windows" ON)
OPTION(WANT_LV2 "Include LV2 Plugin support" ON)
-OPTION(USE_SYSTEM_SLV2_LIBRARY "Use system installed slv2 library. If not
found, the internal one is compiled and used which needs the rdf development
headers" OFF)
+OPTION(USE_SYSTEM_SLV2_LIBRARY "Use system installed slv2 library. If not
found, the internal one is compiled and used which needs the rdf development
headers" ON)
OPTION(WANT_MP3_DECODE "Include mp3 decoding support, for playing mp3 files"
ON)
OPTION(WANT_MP3_ENCODE "Include mp3 encoding support, for creating mp3 files"
ON)
-OPTION(WANT_PCH "Use precompiled headers" ON)
-OPTION(WANT_DEBUG "Debug build" ON)
+OPTION(WANT_PCH "Use precompiled headers" OFF)
+OPTION(WANT_DEBUG "Debug build" OFF)
OPTION(WANT_OPENGL "Build Traverso with OpenGL support" ON)
-OPTION(WANT_TRAVERSO_DEBUG "Provides 4 levels of debug ouput on the command
line" ON)
+OPTION(WANT_TRAVERSO_DEBUG "Provides 4 levels of debug ouput on the command
line, always on for DEBUG builds" OFF)
SET(MAIN_DIR_NAME "src")
@@ -289,22 +289,6 @@
ENDIF(FFTW3_FOUND)
-# Check GCC for PCH support
-SET(USE_PCH FALSE)
-IF(WANT_PCH)
- FIND_PACKAGE(PCHSupport)
- IF(PCHSupport_FOUND)
- SET(USE_PCH TRUE)
- SET(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -include
${CMAKE_SOURCE_DIR}/src/precompile.h -Winvalid-pch")
- SET(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -include
${CMAKE_SOURCE_DIR}/src/precompile.h -Winvalid-pch")
- MESSAGE(STATUS "Enabling precompiled headers for GCC
${gcc_compiler_version}")
- ENDIF(PCHSupport_FOUND)
-ELSE(WANT_PCH)
-#QT_DEFENITIONS are added by pch, so when no pch support add them now!
- ADD_DEFINITIONS(${QT_DEFINITIONS})
-ENDIF(WANT_PCH)
-
-
CHECK_INCLUDE_FILE("sys/vfs.h" HAVE_SYS_VFS_H)
IF(HAVE_SYS_VFS_H)
ADD_DEFINITIONS(-DHAVE_SYS_VFS_H)
@@ -357,6 +341,87 @@
ENDIF(APPLE)
+# posix_memalign() is optional: when it is missing, the sources are built
+# with -DNO_POSIX_MEMALIGN and a notice is printed at configure time.
+CHECK_FUNCTION_EXISTS(posix_memalign HAVE_POSIX_MEMALIGN)
+IF(NOT HAVE_POSIX_MEMALIGN)
+  ADD_DEFINITIONS(-DNO_POSIX_MEMALIGN)
+  MESSAGE("Did not find posix_memalign(), using malloc")
+ENDIF(NOT HAVE_POSIX_MEMALIGN)
+
+
+# Machine hardware name as printed by "uname -m" (e.g. i686 or x86_64).
+execute_process(
+  COMMAND uname -m
+  OUTPUT_VARIABLE uname_invoke_result
+  RESULT_VARIABLE uname_failed
+)
+
+# The "flags" lines of /proc/cpuinfo. Consecutive COMMANDs of one
+# execute_process() call run as a pipeline: cat /proc/cpuinfo | grep ^flags
+execute_process(
+  COMMAND cat /proc/cpuinfo
+  COMMAND grep ^flags
+  OUTPUT_VARIABLE cat_invoke_result
+  RESULT_VARIABLE cat_failed
+)
+
+# Host specific optimizations, applied to release flags only and skipped
+# entirely for debug builds.
+# Variables set here and read by src/core/CMakeLists.txt:
+#   HOST_SUPPORTS_SSE - the cpu flags mention sse
+#   IS_ARCH_X86       - uname reports i486, i586 or i686
+#   IS_ARCH_X86_64    - uname reports x86_64 and SSE is available
+IF(UNIX)
+  IF(NOT WANT_DEBUG)
+    SET(HOST_SUPPORTS_SSE FALSE)
+
+    IF(cat_invoke_result MATCHES sse)
+      SET(HOST_SUPPORTS_SSE TRUE)
+      ADD_DEFINITIONS(-DSSE_OPTIMIZATIONS)
+    ENDIF(cat_invoke_result MATCHES sse)
+
+    IF(cat_invoke_result MATCHES mmx)
+      SET(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -mmmx")
+    ENDIF(cat_invoke_result MATCHES mmx)
+
+    IF(cat_invoke_result MATCHES 3dnow)
+      SET(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -m3dnow")
+    ENDIF(cat_invoke_result MATCHES 3dnow)
+
+    IF(uname_invoke_result MATCHES i586)
+      SET(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -march=i586")
+    ENDIF(uname_invoke_result MATCHES i586)
+
+    IF(uname_invoke_result MATCHES i686)
+      # The flag has to be appended to CMAKE_CXX_FLAGS_RELEASE. Assigning it
+      # to uname_invoke_result (as was done before) dropped -march=i686 from
+      # the build and clobbered the uname output used by the checks below.
+      SET(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -march=i686")
+      IF(HOST_SUPPORTS_SSE)
+        SET(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -msse -mfpmath=sse")
+        ADD_DEFINITIONS(-DUSE_XMMINTRIN)
+      ENDIF(HOST_SUPPORTS_SSE)
+    ENDIF(uname_invoke_result MATCHES i686)
+
+    IF(uname_invoke_result MATCHES x86_64)
+      IF(HOST_SUPPORTS_SSE)
+        SET(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -msse -mfpmath=sse")
+        ADD_DEFINITIONS(-DUSE_XMMINTRIN -DUSE_X86_64_ASM)
+        SET(IS_ARCH_X86_64 TRUE)
+      ENDIF(HOST_SUPPORTS_SSE)
+    ENDIF(uname_invoke_result MATCHES x86_64)
+
+    IF(uname_invoke_result MATCHES i[456]86)
+      ADD_DEFINITIONS(-DARCH_X86)
+      SET(IS_ARCH_X86 TRUE)
+    ENDIF(uname_invoke_result MATCHES i[456]86)
+  ENDIF(NOT WANT_DEBUG)
+ENDIF(UNIX)
+
+# Check GCC for PCH support.
+# USE_PCH ends up TRUE only when precompiled headers were requested
+# (WANT_PCH) and the PCHSupport package was found.
+SET(USE_PCH FALSE)
+IF(WANT_PCH)
+  FIND_PACKAGE(PCHSupport)
+  IF(PCHSupport_FOUND)
+    SET(USE_PCH TRUE)
+    # Force-include the precompiled header in both build configurations.
+    SET(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -include ${CMAKE_SOURCE_DIR}/src/precompile.h -Winvalid-pch")
+    SET(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -include ${CMAKE_SOURCE_DIR}/src/precompile.h -Winvalid-pch")
+    MESSAGE(STATUS "Enabling precompiled headers for GCC ${gcc_compiler_version}")
+  ENDIF(PCHSupport_FOUND)
+ENDIF(WANT_PCH)
+
+# QT_DEFINITIONS are added by the pch setup, so add them here whenever pch
+# is not in use. Testing USE_PCH instead of WANT_PCH also covers the case
+# where pch was requested but PCHSupport was not found.
+IF(NOT USE_PCH)
+  ADD_DEFINITIONS(${QT_DEFINITIONS})
+ENDIF(NOT USE_PCH)
+
+
+
#Add our source subdirs
ADD_SUBDIRECTORY(src)
Index: src/core/CMakeLists.txt
===================================================================
RCS file: /sources/traverso/traverso/src/core/CMakeLists.txt,v
retrieving revision 1.2
retrieving revision 1.3
diff -u -b -r1.2 -r1.3
--- src/core/CMakeLists.txt 5 Nov 2007 22:28:11 -0000 1.2
+++ src/core/CMakeLists.txt 7 Nov 2007 20:28:20 -0000 1.3
@@ -85,9 +85,27 @@
QT4_WRAP_CPP(TRAVERSO_CORE_MOC_SOURCES ${TRAVERSO_CORE_MOC_CLASSES})
+# Assemble the hand written SSE routines into sse_functions.o.
+# HOST_SUPPORTS_SSE, IS_ARCH_X86 and IS_ARCH_X86_64 are set by the top-level
+# CMakeLists.txt. SSE_OBJECT_FILE is only set when an assembly file exists
+# for the detected architecture; it is appended to the library sources.
+IF(UNIX)
+  IF(HOST_SUPPORTS_SSE)
+    SET(ASM_FILE "")
+    IF(IS_ARCH_X86)
+      SET(ASM_FILE ${CMAKE_SOURCE_DIR}/src/common/sse_functions.S)
+    ENDIF(IS_ARCH_X86)
+    IF(IS_ARCH_X86_64)
+      SET(ASM_FILE ${CMAKE_SOURCE_DIR}/src/common/sse_functions_64bit.S)
+    ENDIF(IS_ARCH_X86_64)
+
+    IF(ASM_FILE)
+      add_custom_command(OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/sse_functions.o
+        COMMAND gcc -c -pipe -O2 -fPIC ${ASM_FILE} -o ${CMAKE_CURRENT_BINARY_DIR}/sse_functions.o
+        DEPENDS ${ASM_FILE}
+      )
+      SET(SSE_OBJECT_FILE ${CMAKE_CURRENT_BINARY_DIR}/sse_functions.o)
+    ELSE(ASM_FILE)
+      # Without this guard gcc would be invoked with no input file.
+      MESSAGE(STATUS "SSE detected, but no SSE assembly file is available for this architecture")
+    ENDIF(ASM_FILE)
+  ENDIF(HOST_SUPPORTS_SSE)
+ENDIF(UNIX)
+
+
SET(TRAVERSO_CORE_LIBRARY "traversocore")
-ADD_LIBRARY(${TRAVERSO_CORE_LIBRARY} STATIC ${TRAVERSO_CORE_SOURCES}
${TRAVERSO_CORE_MOC_SOURCES})
+ADD_LIBRARY(${TRAVERSO_CORE_LIBRARY} STATIC ${TRAVERSO_CORE_SOURCES}
${TRAVERSO_CORE_MOC_SOURCES} ${SSE_OBJECT_FILE})
IF(USE_PCH)
ADD_DEPENDENCIES(traversocore precompiled_headers)
Index: src/engine/CMakeLists.txt
===================================================================
RCS file: /sources/traverso/traverso/src/engine/CMakeLists.txt,v
retrieving revision 1.3
retrieving revision 1.4
diff -u -b -r1.3 -r1.4
--- src/engine/CMakeLists.txt 6 Nov 2007 18:25:16 -0000 1.3
+++ src/engine/CMakeLists.txt 7 Nov 2007 20:28:21 -0000 1.4
@@ -43,7 +43,6 @@
)
ENDIF(HAVE_PORTAUDIO)
-
SET(TRAVERSO_ENGINE_LIBRARY "traversoaudiobackend")
QT4_WRAP_CPP(TRAVERSO_ENGINE_MOC_SOURCES ${TRAVERSO_ENGINE_MOC_CLASSES})
Index: src/common/sse_functions_64bit.S
===================================================================
RCS file: src/common/sse_functions_64bit.S
diff -N src/common/sse_functions_64bit.S
--- /dev/null 1 Jan 1970 00:00:00 -0000
+++ src/common/sse_functions_64bit.S 7 Nov 2007 20:28:20 -0000 1.1
@@ -0,0 +1,609 @@
+/*
+ Copyright (C) 2005-2006 Paul Davis, John Rigg
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 2 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+
+ Author: Sampo Savolainen
+ 64-bit conversion: John Rigg
+
+ $Id: sse_functions_64bit.S,v 1.1 2007/11/07 20:28:20 r_sijrier Exp $
+*/
+
+
+#; void x86_sse_mix_buffers_with_gain (float *dst, float *src, unsigned int nframes, float gain);
+#;
+#; Adds src[i] * gain to dst[i] for each of the nframes samples.
+#; When dst and src have the same offset from a 16 byte boundary, the bulk
+#; of the samples is processed four at a time with aligned packed SSE
+#; instructions; otherwise every sample is processed with scalar SSE.
+
+.globl x86_sse_mix_buffers_with_gain
+    .type x86_sse_mix_buffers_with_gain,@function
+
+x86_sse_mix_buffers_with_gain:
+
+#; %rdi  float *dst
+#; %rsi  float *src
+#; %edx  unsigned int nframes (zero-extended into %rdx below)
+#; %xmm0 float gain
+
+    pushq %rbp
+    movq %rsp, %rbp
+
+    #; save the registers
+    pushq %rbx
+    pushq %rdi
+    pushq %rsi
+
+    #; nframes is a 32 bit argument, so only %edx is defined on entry.
+    #; A 32 bit register write clears bits 32-63 of %rdx, which makes the
+    #; 64 bit compares and decrements on %rdx below well defined.
+    movl %edx, %edx
+
+    #; if nframes == 0, go to end
+    cmp $0, %rdx
+    je .MBWG_END
+
+    #; Check for alignment
+
+    movq %rdi, %rax
+    andq $12, %rax #; dst offset inside its 16 byte block: 0, 4, 8 or 12
+
+    movq %rsi, %rbx
+    andq $12, %rbx #; src offset inside its 16 byte block
+
+    cmp %rax, %rbx
+    jne .MBWG_NONALIGN #; offsets differ: use the scalar loop for everything
+
+    #; same offset; zero means both buffers are already 16 byte aligned
+    cmp $0, %rbx
+    jz .MBWG_SSE
+
+    #; Pre-loop, we need to run 1-3 frames "manually" without
+    #; packed SSE instructions
+
+.MBWG_PRELOOP:
+
+    #; gain is already in %xmm0
+    movss (%rsi), %xmm1
+    mulss %xmm0, %xmm1
+    addss (%rdi), %xmm1
+    movss %xmm1, (%rdi)
+
+    addq $4, %rdi #; dst++
+    addq $4, %rsi #; src++
+    decq %rdx #; nframes--
+    jz .MBWG_END
+
+    addq $4, %rbx
+
+    cmp $16, %rbx #; test if the offset has reached the next 16 byte boundary
+    jne .MBWG_PRELOOP
+
+
+.MBWG_SSE:
+
+    cmp $4, %rdx #; nframes is not zero here, but with fewer than 4 left
+    jnge .MBWG_NONALIGN #; the scalar loop has to finish the job
+
+    #; gain is already in %xmm0, copy it to all four lanes
+    shufps $0x00, %xmm0, %xmm0
+
+
+.MBWG_SSELOOP:
+
+    movaps (%rsi), %xmm1 #; source => xmm1
+    mulps %xmm0, %xmm1 #; apply gain to source
+    addps (%rdi), %xmm1 #; mix with destination
+    movaps %xmm1, (%rdi) #; copy result to destination
+
+    addq $16, %rdi #; dst+=4
+    addq $16, %rsi #; src+=4
+
+    subq $4, %rdx #; nframes-=4
+    cmp $4, %rdx
+    jge .MBWG_SSELOOP
+
+    cmp $0, %rdx
+    je .MBWG_END
+
+    #; if there are remaining frames, the nonalign code will do nicely
+    #; for the rest 1-3 frames.
+
+.MBWG_NONALIGN:
+    #; not aligned!
+
+    #; gain is already in %xmm0 (lane 0 holds it with or without the shuffle)
+
+.MBWG_NONALIGNLOOP:
+
+    movss (%rsi), %xmm1
+    mulss %xmm0, %xmm1
+    addss (%rdi), %xmm1
+    movss %xmm1, (%rdi)
+
+    addq $4, %rdi
+    addq $4, %rsi
+
+    decq %rdx
+    jnz .MBWG_NONALIGNLOOP
+
+.MBWG_END:
+
+    popq %rsi
+    popq %rdi
+    popq %rbx
+
+    #; return
+    leave
+    ret
+
+.size x86_sse_mix_buffers_with_gain, .-x86_sse_mix_buffers_with_gain
+
+
+#; void x86_sse_mix_buffers_no_gain (float *dst, float *src, unsigned int nframes);
+#;
+#; Adds src[i] to dst[i] for each of the nframes samples.
+#; When dst and src have the same offset from a 16 byte boundary, the bulk
+#; of the samples is processed four at a time with aligned packed SSE
+#; instructions; otherwise every sample is processed with scalar SSE.
+
+.globl x86_sse_mix_buffers_no_gain
+    .type x86_sse_mix_buffers_no_gain,@function
+
+x86_sse_mix_buffers_no_gain:
+
+#; %rdi  float *dst
+#; %rsi  float *src
+#; %edx  unsigned int nframes (zero-extended into %rdx below)
+
+    pushq %rbp
+    movq %rsp, %rbp
+
+    #; save the registers
+    pushq %rbx
+    pushq %rdi
+    pushq %rsi
+
+    #; nframes is a 32 bit argument, so only %edx is defined on entry.
+    #; A 32 bit register write clears bits 32-63 of %rdx, which makes the
+    #; 64 bit compares and decrements on %rdx below well defined.
+    movl %edx, %edx
+
+    #; if nframes == 0, go to end
+    cmp $0, %rdx
+    je .MBNG_END
+
+    #; Check for alignment
+
+    movq %rdi, %rax
+    andq $12, %rax #; dst offset inside its 16 byte block: 0, 4, 8 or 12
+
+    movq %rsi, %rbx
+    andq $12, %rbx #; src offset inside its 16 byte block
+
+    cmp %rax, %rbx
+    jne .MBNG_NONALIGN #; offsets differ: use the scalar loop for everything
+
+    #; same offset; zero means both buffers are already 16 byte aligned
+    cmp $0, %rbx
+    je .MBNG_SSE
+
+    #; Pre-loop, we need to run 1-3 frames "manually" without
+    #; packed SSE instructions
+
+.MBNG_PRELOOP:
+
+    movss (%rsi), %xmm0
+    addss (%rdi), %xmm0
+    movss %xmm0, (%rdi)
+
+    addq $4, %rdi #; dst++
+    addq $4, %rsi #; src++
+    decq %rdx #; nframes--
+    jz .MBNG_END
+    addq $4, %rbx
+
+    cmp $16, %rbx #; test if the offset has reached the next 16 byte boundary
+    jne .MBNG_PRELOOP
+
+.MBNG_SSE:
+
+    cmp $4, %rdx #; if there are frames left, but less than 4
+    jnge .MBNG_NONALIGN #; the packed loop cannot be used
+
+.MBNG_SSELOOP:
+
+    movaps (%rsi), %xmm0 #; source => xmm0
+    addps (%rdi), %xmm0 #; mix with destination
+    movaps %xmm0, (%rdi) #; copy result to destination
+
+    addq $16, %rdi #; dst+=4
+    addq $16, %rsi #; src+=4
+
+    subq $4, %rdx #; nframes-=4
+    cmp $4, %rdx
+    jge .MBNG_SSELOOP
+
+    cmp $0, %rdx
+    je .MBNG_END
+
+    #; if there are remaining frames, the nonalign code will do nicely
+    #; for the rest 1-3 frames.
+
+.MBNG_NONALIGN:
+    #; scalar loop: one frame per iteration
+
+    movss (%rsi), %xmm0 #; src => xmm0
+    addss (%rdi), %xmm0 #; xmm0 += dst
+    movss %xmm0, (%rdi) #; xmm0 => dst
+
+    addq $4, %rdi
+    addq $4, %rsi
+
+    decq %rdx
+    jnz .MBNG_NONALIGN
+
+.MBNG_END:
+
+    popq %rsi
+    popq %rdi
+    popq %rbx
+
+    #; return
+    leave
+    ret
+
+.size x86_sse_mix_buffers_no_gain, .-x86_sse_mix_buffers_no_gain
+
+
+#; void x86_sse_apply_gain_to_buffer (float *buf, unsigned int nframes, float gain);
+#;
+#; Multiplies each of the nframes samples in buf by gain, in place.
+#; Leading samples are processed one at a time until buf reaches a 16 byte
+#; boundary, then four at a time, then the 0-3 trailing samples one at a time.
+
+.globl x86_sse_apply_gain_to_buffer
+    .type x86_sse_apply_gain_to_buffer,@function
+
+x86_sse_apply_gain_to_buffer:
+
+#; %rdi  float *buf
+#; %esi  unsigned int nframes (zero-extended into %rcx below)
+#; %xmm0 float gain
+#; %xmm1 scratch: sample(s) loaded from buf
+
+    pushq %rbp
+    movq %rsp, %rbp
+
+    #; save %rdi
+    pushq %rdi
+
+    #; nframes is a 32 bit argument, so only %esi is defined on entry.
+    #; The 32 bit move clears bits 32-63 of %rcx, which is the frame
+    #; counter from here on.
+    movl %esi, %ecx
+
+    #; if nframes == 0, go to end
+    cmp $0, %rcx
+    je .AG_END
+
+    #; set up the gain buffer (gain is already in %xmm0): copy to all lanes
+    shufps $0x00, %xmm0, %xmm0
+
+    #; Check for alignment
+
+    movq %rdi, %rdx #; buf => %rdx
+    andq $12, %rdx #; keep address bits 2 & 3, result = 0, 4, 8 or 12
+    jz .AG_SSE #; if buffer IS aligned
+
+    #; PRE-LOOP
+    #; we iterate 1-3 times with scalar SSE multiplies
+    #; so we reach a 16 byte aligned "buf" (=%rdi) value
+
+.AGLP_START:
+
+    #; Load next value from the buffer into %xmm1
+    movss (%rdi), %xmm1
+    mulss %xmm0, %xmm1
+    movss %xmm1, (%rdi)
+
+    #; increment buffer, decrement counter
+    addq $4, %rdi #; buf++;
+
+    decq %rcx #; nframes--
+    jz .AG_END #; if we run out of frames, we go to the end
+
+    addq $4, %rdx #; one unaligned frame (4 bytes) less
+    cmp $16, %rdx
+    jne .AGLP_START #; if more non-aligned frames exist, we do a do-over
+
+.AG_SSE:
+
+    #; We have reached the 16 byte aligned "buf" ("rdi") value
+
+    #; Figure out how many packed iterations we should do:
+    #; %rax = remaining nframes / 4. A shift replaces the former divq,
+    #; which needed %rdi as a temporary divisor register.
+    movq %rcx, %rax
+    shrq $2, %rax #; unsigned divide by 4, sets ZF when the result is 0
+    jz .AGPOST_START #; fewer than 4 frames left (but at least 1)
+
+
+.AGLP_SSE:
+
+    movaps (%rdi), %xmm1
+    mulps %xmm0, %xmm1
+    movaps %xmm1, (%rdi)
+
+    addq $16, %rdi
+    subq $4, %rcx #; nframes-=4
+
+    decq %rax
+    jnz .AGLP_SSE
+
+    #; Next we need to post-process all remaining frames
+    #; the remaining frame count is in %rcx
+
+    #; if no remaining frames, jump to the end
+    andq $3, %rcx #; nframes % 4, sets ZF when nothing is left
+    je .AG_END
+
+.AGPOST_START:
+
+    movss (%rdi), %xmm1
+    mulss %xmm0, %xmm1
+    movss %xmm1, (%rdi)
+
+    #; increment buffer, decrement counter
+    addq $4, %rdi #; buf++;
+
+    decq %rcx #; nframes--
+    jnz .AGPOST_START #; loop while frames remain
+
+.AG_END:
+
+
+    popq %rdi
+
+    #; return
+    leave
+    ret
+
+.size x86_sse_apply_gain_to_buffer, .-x86_sse_apply_gain_to_buffer
+#; end proc
+
+
+#; x86_sse_apply_gain_vector(float *buf, float *gain_vector, unsigned int nframes)
+#;
+#; Multiplies buf[i] by gain_vector[i] for each of the nframes samples and
+#; stores the product back into buf[i].
+
+.globl x86_sse_apply_gain_vector
+    .type x86_sse_apply_gain_vector,@function
+
+x86_sse_apply_gain_vector:
+
+#; %rdi  float *buf
+#; %rsi  float *gain_vector
+#; %edx  unsigned int nframes (zero-extended into %rdx below)
+
+    pushq %rbp
+    movq %rsp, %rbp
+
+    #; Save registers
+    pushq %rdi
+    pushq %rsi
+    pushq %rbx
+
+    #; nframes is a 32 bit argument, so only %edx is defined on entry.
+    #; A 32 bit register write clears bits 32-63 of %rdx, which makes the
+    #; 64 bit arithmetic on %rdx below well defined.
+    movl %edx, %edx
+
+    #; if nframes == 0 go to end
+    cmp $0, %rdx
+    je .AGA_END
+
+    #; Check alignment
+    movq %rdi, %rax
+    andq $12, %rax #; buf offset inside its 16 byte block: 0, 4, 8 or 12
+
+    movq %rsi, %rbx
+    andq $12, %rbx #; gain_vector offset inside its 16 byte block
+
+    cmp %rax,%rbx
+    jne .AGA_ENDLOOP #; offsets differ: use the scalar loop for everything
+
+    cmp $0, %rax
+    jz .AGA_SSE #; if buffers are aligned, jump to the SSE loop
+
+#; Buffers aren't 16 byte aligned, but they are unaligned by the same amount
+.AGA_ALIGNLOOP:
+
+    movss (%rdi), %xmm0 #; buf => xmm0
+    movss (%rsi), %xmm1 #; gain value => xmm1
+    mulss %xmm1, %xmm0 #; xmm1 * xmm0 => xmm0
+    movss %xmm0, (%rdi) #; signal with gain => buf
+
+    decq %rdx
+    jz .AGA_END
+
+    addq $4, %rdi #; buf++
+    addq $4, %rsi #; gain_vector++
+
+    addq $4, %rax
+    cmp $16, %rax
+    jne .AGA_ALIGNLOOP
+
+#; There are frames left for sure, as that is checked in the beginning
+#; and within the previous loop. BUT, there might be less than 4 frames
+#; to process
+
+.AGA_SSE:
+    movq %rdx, %rax #; nframes => %rax
+    shr $2, %rax #; unsigned divide by 4
+
+    cmp $0, %rax #; original note (Finnish): "if it works without this, nice"
+    je .AGA_ENDLOOP
+
+.AGA_SSELOOP:
+    movaps (%rdi), %xmm0
+    movaps (%rsi), %xmm1
+    mulps %xmm1, %xmm0
+    movaps %xmm0, (%rdi)
+
+    addq $16, %rdi
+    addq $16, %rsi
+
+    decq %rax
+    jnz .AGA_SSELOOP
+
+    andq $3, %rdx #; Remaining frames are nframes & 3
+    jz .AGA_END
+
+
+#; Inside this loop, we know there are frames left to process
+#; but because either there are < 4 frames left, or the buffers
+#; are not aligned, we can't use the parallel SSE ops
+.AGA_ENDLOOP:
+    movss (%rdi), %xmm0 #; buf => xmm0
+    movss (%rsi), %xmm1 #; gain value => xmm1
+    mulss %xmm1, %xmm0 #; xmm1 * xmm0 => xmm0
+    movss %xmm0, (%rdi) #; signal with gain => buf
+
+    addq $4,%rdi
+    addq $4,%rsi
+    decq %rdx #; nframes--
+    jnz .AGA_ENDLOOP
+
+.AGA_END:
+
+    popq %rbx
+    popq %rsi
+    popq %rdi
+
+    leave
+    ret
+
+.size x86_sse_apply_gain_vector, .-x86_sse_apply_gain_vector
+#; end proc
+
+
+#; float x86_sse_compute_peak(float *buf, long nframes, float current);
+#; Returns in %xmm0 the larger of "current" and the biggest absolute sample value in buf.
+.globl x86_sse_compute_peak
+ .type x86_sse_compute_peak,@function
+
+#; NOTE(review): the prototype says "long nframes", the register list below said "unsigned int"; all 64 bits of %rsi are read - confirm against the C declaration.
+x86_sse_compute_peak:
+
+#; %rdi float *buf
+#; %rsi nframes (copied to %rcx, which serves as the frame counter)
+#; %xmm0 float current (running peak and return value)
+#; %xmm1 scratch: sample(s) loaded from buf
+
+ pushq %rbp
+ movq %rsp, %rbp
+
+ #; save %rdi, it is advanced through the buffer below
+ pushq %rdi
+
+ #; if nframes == 0, go to end ("current" stays untouched in %xmm0)
+ movq %rsi, %rcx #; nframes
+ cmp $0, %rcx
+ je .CP_END
+
+ #; create the "abs" mask in %xmm2: 0x7fffffff in all four lanes, clears the sign bit
+ pushq $2147483647
+ movss (%rsp), %xmm2
+ addq $8, %rsp
+ shufps $0x00, %xmm2, %xmm2
+
+ #; Check for alignment
+
+ #; buf already arrives in %rdi, nothing needs to be loaded from the stack
+ movq %rdi, %rdx #; buf => %rdx
+ andq $12, %rdx #; keep address bits 2 & 3, result = 0, 4, 8 or 12
+ jz .CP_SSE #; if buffer IS aligned
+
+ #; PRE-LOOP
+ #; we iterate 1-3 times, comparing one sample at a time with scalar SSE
+ #; so we reach a 16 byte aligned "buf" (=%rdi) value
+
+.LP_START:
+
+ #; Load next value from the buffer, clear its sign bit, keep the maximum
+ movss (%rdi), %xmm1
+ andps %xmm2, %xmm1
+ maxss %xmm1, %xmm0
+
+ #; increment buffer, decrement counter
+ addq $4, %rdi #; buf++;
+
+ decq %rcx #; nframes--
+ jz .CP_END #; if we run out of frames, we go to the end
+
+ addq $4, %rdx #; one unaligned frame (4 bytes) less
+ cmp $16, %rdx
+ jne .LP_START #; if more non-aligned frames exist, we do a do-over
+
+.CP_SSE:
+
+ #; We have reached the 16 byte aligned "buf" ("rdi") value
+
+ #; Figure out how many packed iterations we should do
+ movq %rcx, %rax #; copy remaining nframes to %rax for division
+
+ shr $2,%rax #; unsigned divide by 4
+ jz .POST_START
+
+ #; %rax = SSE iterations
+
+ #; current maximum is in lane 0 of %xmm0, the packed loop needs it in every lane
+ shufps $0x00, %xmm0, %xmm0 #; shuffle "current" to all 4 FP's
+
+ #; (a prefetcht0 16(%rdi) hint used to be here, disabled)
+
+.LP_SSE:
+
+ movaps (%rdi), %xmm1
+ andps %xmm2, %xmm1
+ maxps %xmm1, %xmm0
+
+ addq $16, %rdi
+
+ decq %rax
+ jnz .LP_SSE
+
+ #; Calculate the maximum value contained in the 4 FP's in %xmm0
+ movaps %xmm0, %xmm1
+ shufps $0x4e, %xmm1, %xmm1 #; shuffle left & right pairs (1234 => 3412)
+ maxps %xmm1, %xmm0 #; maximums of the two pairs
+ movaps %xmm0, %xmm1
+ shufps $0xb1, %xmm1, %xmm1 #; shuffle the floats inside the two pairs
(1234 => 2143)
+ maxps %xmm1, %xmm0
+
+ #; now every float in %xmm0 is the same value, current maximum value
+
+ #; Next we need to post-process all remaining frames
+ #; %rcx was not decremented by the packed loop, so nframes % 4 is what is left
+
+ #; if no remaining frames, jump to the end
+
+ andq $3, %rcx #; nframes % 4
+ jz .CP_END
+
+.POST_START:
+
+ movss (%rdi), %xmm1
+ andps %xmm2, %xmm1
+ maxss %xmm1, %xmm0
+
+ addq $4, %rdi #; buf++;
+
+ decq %rcx #; nframes--;
+ jnz .POST_START
+
+.CP_END:
+
+ popq %rdi
+
+ #; return, the result is in lane 0 of %xmm0
+ leave
+ ret
+
+.size x86_sse_compute_peak, .-x86_sse_compute_peak
+#; end proc
+
+#ifdef __ELF__
+.section .note.GNU-stack,"",%progbits
+#endif
+
Index: src/common/sse_functions.S
===================================================================
RCS file: src/common/sse_functions.S
diff -N src/common/sse_functions.S
--- /dev/null 1 Jan 1970 00:00:00 -0000
+++ src/common/sse_functions.S 7 Nov 2007 20:28:20 -0000 1.1
@@ -0,0 +1,529 @@
+/*
+ Copyright (C) 2005 Sampo Savolainen
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 2 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+
+ $Id: sse_functions.S,v 1.1 2007/11/07 20:28:20 r_sijrier Exp $
+*/
+
+
+#; void x86_sse_mix_buffers_with_gain (float *dst, float *src, long nframes,
float gain);
+
+.globl x86_sse_mix_buffers_with_gain
+ .type x86_sse_mix_buffers_with_gain,@function
+#; Adds src[i] * gain to dst[i] for each of the nframes samples (32 bit x86, arguments on the stack).
+x86_sse_mix_buffers_with_gain:
+#; 8(%ebp) = float *dst = %edi
+#; 12(%ebp) = float *src = %esi
+#; 16(%ebp) = long nframes = %ecx
+#; 20(%ebp) = float gain = loaded into %xmm1
+
+ pushl %ebp
+ movl %esp, %ebp
+
+ #; save the registers
+#; (%eax is used as scratch below and is not preserved)
+ pushl %ebx
+#; (%ecx is used as the frame counter and is not preserved)
+ pushl %edi
+ pushl %esi
+
+ #; if nframes == 0, go to end
+ movl 16(%ebp), %ecx #; nframes
+ cmp $0, %ecx
+ je .MBWG_END
+
+ #; Check for alignment
+
+ movl 8(%ebp), %edi #; dst
+ movl 12(%ebp), %esi #; src
+
+ movl %edi, %eax
+ andl $12, %eax #; dst offset inside its 16 byte block: 0, 4, 8 or 12
+
+ movl %esi, %ebx
+ andl $12, %ebx #; src offset inside its 16 byte block
+
+ cmp %eax, %ebx
+ jne .MBWG_NONALIGN #; offsets differ: use the scalar loop for everything
+
+ #; same offset; zero means both buffers are already 16 byte aligned
+ cmp $0, %ebx
+ jz .MBWG_SSE
+
+ #; Pre-loop, we need to run 1-3 frames "manually" without
+ #; packed SSE instructions
+
+ movss 20(%ebp), %xmm1 #; gain => xmm1
+
+.MBWG_PRELOOP:
+
+ movss (%esi), %xmm0
+ mulss %xmm1, %xmm0
+ addss (%edi), %xmm0
+ movss %xmm0, (%edi)
+
+ addl $4, %edi #; dst++
+ addl $4, %esi #; src++
+ decl %ecx #; nframes--
+ jz .MBWG_END
+
+#; (no separate compare with zero is needed here:
+#; decl already set ZF for the jz above)
+
+ addl $4, %ebx
+
+ cmp $16, %ebx #; test if the offset has reached the next 16 byte boundary
+ jne .MBWG_PRELOOP
+
+
+.MBWG_SSE:
+
+ cmp $4, %ecx #; nframes is not zero here, but with fewer than 4 left
+ jnge .MBWG_NONALIGN #; the scalar loop has to finish the job
+
+ #; copy gain to all four lanes of %xmm1
+ movss 20(%ebp), %xmm1
+ shufps $0x00, %xmm1, %xmm1
+
+
+.MBWG_SSELOOP:
+
+ movaps (%esi), %xmm0 #; source => xmm0
+ mulps %xmm1, %xmm0 #; apply gain to source
+ addps (%edi), %xmm0 #; mix with destination
+ movaps %xmm0, (%edi) #; copy result to destination
+
+ addl $16, %edi #; dst+=4
+ addl $16, %esi #; src+=4
+
+ subl $4, %ecx #; nframes-=4
+ cmp $4, %ecx
+ jge .MBWG_SSELOOP
+
+ cmp $0, %ecx
+ je .MBWG_END
+
+ #; if there are remaining frames, the nonalign code will do nicely
+ #; for the rest 1-3 frames.
+
+.MBWG_NONALIGN:
+ #; scalar path: buffers not mutually aligned, or fewer than 4 frames left
+
+ movss 20(%ebp), %xmm1 #; gain => xmm1
+
+.MBWG_NONALIGNLOOP:
+
+ movss (%esi), %xmm0
+ mulss %xmm1, %xmm0
+ addss (%edi), %xmm0
+ movss %xmm0, (%edi)
+
+ addl $4, %edi
+ addl $4, %esi
+
+ decl %ecx
+ jnz .MBWG_NONALIGNLOOP
+
+.MBWG_END:
+
+ popl %esi
+ popl %edi
+#; (%ecx was not saved, see above)
+ popl %ebx
+#; (%eax was not saved, see above)
+
+ #; return
+ leave
+ ret
+
+.size x86_sse_mix_buffers_with_gain, .-x86_sse_mix_buffers_with_gain
+
+
+
+
+#; void x86_sse_mix_buffers_no_gain (float *dst, float *src, long nframes);
+#; Adds src[i] to dst[i] for each of the nframes samples (32 bit x86, arguments on the stack).
+.globl x86_sse_mix_buffers_no_gain
+ .type x86_sse_mix_buffers_no_gain,@function
+
+x86_sse_mix_buffers_no_gain:
+#; 8(%ebp) = float *dst = %edi
+#; 12(%ebp) = float *src = %esi
+#; 16(%ebp) = long nframes = %ecx
+
+ pushl %ebp
+ movl %esp, %ebp
+
+ #; save the registers
+#; (%eax is used as scratch below and is not preserved)
+ pushl %ebx
+#; (%ecx is used as the frame counter and is not preserved)
+ pushl %edi
+ pushl %esi
+
+ #; the real function
+
+ #; if nframes == 0, go to end
+ movl 16(%ebp), %ecx #; nframes
+ cmp $0, %ecx
+ je .MBNG_END
+
+ #; Check for alignment
+
+ movl 8(%ebp), %edi #; dst
+ movl 12(%ebp), %esi #; src
+
+ movl %edi, %eax
+ andl $12, %eax #; dst offset inside its 16 byte block: 0, 4, 8 or 12
+
+ movl %esi, %ebx
+ andl $12, %ebx #; src offset inside its 16 byte block
+
+ cmp %eax, %ebx
+ jne .MBNG_NONALIGN #; offsets differ: use the scalar loop for everything
+
+ cmp $0, %ebx #; same offset; zero means both buffers are already aligned
+ je .MBNG_SSE
+
+ #; Pre-loop, we need to run 1-3 frames "manually" without
+ #; packed SSE instructions
+
+.MBNG_PRELOOP:
+
+ movss (%esi), %xmm0
+ addss (%edi), %xmm0
+ movss %xmm0, (%edi)
+
+ addl $4, %edi #; dst++
+ addl $4, %esi #; src++
+ decl %ecx #; nframes--
+ jz .MBNG_END
+ addl $4, %ebx
+
+ cmp $16, %ebx #; test if the offset has reached the next 16 byte boundary
+ jne .MBNG_PRELOOP
+
+.MBNG_SSE:
+
+ cmp $4, %ecx #; if there are frames left, but less than 4
+ jnge .MBNG_NONALIGN #; the packed loop cannot be used
+
+.MBNG_SSELOOP:
+
+ movaps (%esi), %xmm0 #; source => xmm0
+ addps (%edi), %xmm0 #; mix with destination
+ movaps %xmm0, (%edi) #; copy result to destination
+
+ addl $16, %edi #; dst+=4
+ addl $16, %esi #; src+=4
+
+ subl $4, %ecx #; nframes-=4
+ cmp $4, %ecx
+ jge .MBNG_SSELOOP
+
+ cmp $0, %ecx
+ je .MBNG_END
+
+ #; if there are remaining frames, the nonalign code will do nicely
+ #; for the rest 1-3 frames.
+
+.MBNG_NONALIGN:
+ #; scalar loop: one frame per iteration
+
+ movss (%esi), %xmm0 #; src => xmm0
+ addss (%edi), %xmm0 #; xmm0 += dst
+ movss %xmm0, (%edi) #; xmm0 => dst
+
+ addl $4, %edi
+ addl $4, %esi
+
+ decl %ecx
+ jnz .MBNG_NONALIGN
+
+.MBNG_END:
+
+ popl %esi
+ popl %edi
+#; (%ecx was not saved, see above)
+ popl %ebx
+#; (%eax was not saved, see above)
+
+ #; return
+ leave
+ ret
+
+.size x86_sse_mix_buffers_no_gain, .-x86_sse_mix_buffers_no_gain
+
+
+
+
+#; void x86_sse_apply_gain_to_buffer (float *buf, long nframes, float gain);
+#; Multiplies each of the nframes samples in buf by gain, in place (32 bit x86, arguments on the stack).
+.globl x86_sse_apply_gain_to_buffer
+ .type x86_sse_apply_gain_to_buffer,@function
+
+x86_sse_apply_gain_to_buffer:
+#; 8(%ebp) = float *buf = %edi
+#; 12(%ebp) = long nframes = %ecx
+#; 16(%ebp) = float gain = loaded into %xmm1
+
+ pushl %ebp
+ movl %esp, %ebp
+
+ #; save %edi
+ pushl %edi
+
+ #; the real function
+
+ #; if nframes == 0, go to end
+ movl 12(%ebp), %ecx #; nframes
+ cmp $0, %ecx
+ je .AG_END
+
+ #; create the gain buffer in %xmm1: gain copied to all four lanes
+ movss 16(%ebp), %xmm1
+ shufps $0x00, %xmm1, %xmm1
+
+ #; Check for alignment
+
+ movl 8(%ebp), %edi #; buf
+ movl %edi, %edx #; buf => %edx
+ andl $12, %edx #; keep address bits 2 & 3, result = 0, 4, 8 or 12
+ jz .AG_SSE #; if buffer IS aligned
+
+ #; PRE-LOOP
+ #; we iterate 1-3 times with scalar SSE multiplies
+ #; so we reach a 16 byte aligned "buf" (=%edi) value
+
+.AGLP_START:
+
+ #; Load next value from the buffer, scale it, store it back
+ movss (%edi), %xmm0
+ mulss %xmm1, %xmm0
+ movss %xmm0, (%edi)
+
+ #; increment buffer, decrement counter
+ addl $4, %edi #; buf++;
+
+ decl %ecx #; nframes--
+ jz .AG_END #; if we run out of frames, we go to the end
+
+ addl $4, %edx #; one unaligned frame (4 bytes) less
+ cmp $16, %edx
+ jne .AGLP_START #; if more non-aligned frames exist, we do a do-over
+
+.AG_SSE:
+
+ #; We have reached the 16 byte aligned "buf" ("edi") value
+
+ #; Figure out how many packed iterations we should do
+ movl %ecx, %eax #; copy remaining nframes to %eax for division
+ movl $0, %edx #; clear %edx, the high half of the dividend
+
+ #; %edi is borrowed as the divisor register, so it is saved around the divide
+ pushl %edi
+ movl $4, %edi
+ divl %edi #; %eax = frames / 4, %edx = frames % 4 (not used afterwards)
+ popl %edi
+
+ #; %eax = SSE iterations
+ cmp $0, %eax
+ je .AGPOST_START
+
+
+.AGLP_SSE:
+
+ movaps (%edi), %xmm0
+ mulps %xmm1, %xmm0
+ movaps %xmm0, (%edi)
+
+ addl $16, %edi
+#; (%ecx is deliberately not decremented here; the remainder is taken below)
+
+ decl %eax
+ jnz .AGLP_SSE
+
+ #; Next we need to post-process all remaining frames
+ #; %ecx still holds the count from before the packed loop
+
+ #; if no remaining frames, jump to the end
+#; (no explicit compare is needed: andl sets ZF)
+ andl $3, %ecx #; nframes % 4
+ je .AG_END
+
+.AGPOST_START:
+
+ movss (%edi), %xmm0
+ mulss %xmm1, %xmm0
+ movss %xmm0, (%edi)
+
+ #; increment buffer, decrement counter
+ addl $4, %edi #; buf++;
+
+ decl %ecx #; nframes--
+ jnz .AGPOST_START #; loop while frames remain
+
+.AG_END:
+
+
+ popl %edi
+
+ #; return
+ leave
+ ret
+
+.size x86_sse_apply_gain_to_buffer, .-x86_sse_apply_gain_to_buffer
+#; end proc
+
+
+
+#; float x86_sse_compute_peak(float *buf, long nframes, float current);
+#;
+#; Returns, on the x87 stack, the larger of "current" and the biggest
+#; absolute sample value found in the nframes samples of buf.
+
+.globl x86_sse_compute_peak
+    .type x86_sse_compute_peak,@function
+
+x86_sse_compute_peak:
+#; 8(%ebp)  = float *buf     = %edi
+#; 12(%ebp) = long nframes   = %ecx
+#; 16(%ebp) = float current  = %xmm0 (running peak)
+
+    pushl %ebp
+    movl %esp, %ebp
+
+    #; save %edi
+    pushl %edi
+
+    #; the real function
+
+    #; Load "current" in xmm0
+    movss 16(%ebp), %xmm0
+
+    #; if nframes == 0, go to end
+    movl 12(%ebp), %ecx #; nframes
+    cmp $0, %ecx
+    je .CP_END
+
+    #; create the "abs" mask (0x7fffffff, clears the sign bit) in %xmm2.
+    #; The constant is built on the stack, as in the 64 bit version of this
+    #; routine, so no data word lives in the text section and no absolute
+    #; address is referenced.
+    pushl $2147483647
+    movss (%esp), %xmm2
+    addl $4, %esp
+    shufps $0x00, %xmm2, %xmm2
+
+    #; Check for alignment
+
+    movl 8(%ebp), %edi #; buf
+    movl %edi, %edx #; buf => %edx
+    andl $12, %edx #; keep address bits 2 & 3, result = 0, 4, 8 or 12
+    jz .CP_SSE #; if buffer IS aligned
+
+    #; PRE-LOOP
+    #; we iterate 1-3 times, comparing one sample at a time with scalar SSE
+    #; so we reach a 16 byte aligned "buf" (=%edi) value
+
+.LP_START:
+
+    #; Load next value from the buffer, clear its sign bit, keep the maximum
+    movss (%edi), %xmm1
+    andps %xmm2, %xmm1
+    maxss %xmm1, %xmm0
+
+    #; increment buffer, decrement counter
+    addl $4, %edi #; buf++;
+
+    decl %ecx #; nframes--
+    jz .CP_END #; if we run out of frames, we go to the end
+
+    addl $4, %edx #; one unaligned frame (4 bytes) less
+    cmp $16, %edx
+    jne .LP_START #; if more non-aligned frames exist, we do a do-over
+
+.CP_SSE:
+
+    #; We have reached the 16 byte aligned "buf" ("edi") value
+
+    #; Figure out how many packed iterations we should do
+    movl %ecx, %eax #; copy remaining nframes to %eax for division
+
+    shr $2,%eax #; unsigned divide by 4
+    jz .POST_START
+
+    #; %eax = SSE iterations
+
+    #; current maximum is in lane 0 of %xmm0, the packed loop needs it in
+    #; every lane
+    shufps $0x00, %xmm0, %xmm0
+
+.LP_SSE:
+
+    movaps (%edi), %xmm1
+    andps %xmm2, %xmm1
+    maxps %xmm1, %xmm0
+
+    addl $16, %edi
+
+    decl %eax
+    jnz .LP_SSE
+
+    #; Calculate the maximum value contained in the 4 FP lanes of %xmm0
+    movaps %xmm0, %xmm1
+    shufps $0x4e, %xmm1, %xmm1 #; shuffle left & right pairs (1234 => 3412)
+    maxps %xmm1, %xmm0 #; maximums of the two pairs
+    movaps %xmm0, %xmm1
+    shufps $0xb1, %xmm1, %xmm1 #; shuffle the floats inside the two pairs (1234 => 2143)
+    maxps %xmm1, %xmm0
+
+    #; now every float in %xmm0 is the same value, current maximum value
+
+    #; Next we need to post-process all remaining frames.
+    #; %ecx was not decremented by the packed loop, so nframes % 4 is
+    #; what is left.
+
+    andl $3, %ecx #; nframes % 4
+    jz .CP_END
+
+.POST_START:
+
+    movss (%edi), %xmm1
+    andps %xmm2, %xmm1
+    maxss %xmm1, %xmm0
+
+    addl $4, %edi #; buf++;
+
+    decl %ecx #; nframes--;
+    jnz .POST_START
+
+.CP_END:
+
+    #; Move the value from xmm0 to the x87 stack for returning; the
+    #; argument slot of "current" is reused as the transfer location
+    movss %xmm0, 16(%ebp)
+    flds 16(%ebp)
+
+    popl %edi
+
+    #; return
+    leave
+    ret
+
+.size x86_sse_compute_peak, .-x86_sse_compute_peak
+#; end proc
+
+#ifdef __ELF__
+.section .note.GNU-stack,"",%progbits
+#endif
- [Traverso-commit] traverso CMakeLists.txt src/core/CMakeLists.txt...,
Remon Sijrier <=