commit-gnuradio
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

[Commit-gnuradio] [gnuradio] 16/22: volk: add neon kernel for 64u_byteswap and puppets for 64/16 byteswap


From: git
Subject: [Commit-gnuradio] [gnuradio] 16/22: volk: add neon kernel for 64u_byteswap and puppets for 64/16 byteswap
Date: Fri, 31 Oct 2014 19:22:31 +0000 (UTC)

This is an automated email from the git hooks/post-receive script.

jcorgan pushed a commit to branch master
in repository gnuradio.

commit 2c4c371885c31222362f70a1cd714415d1398021
Author: Nathan West <address@hidden>
Date:   Wed Oct 22 14:09:44 2014 -0500

    volk: add neon kernel for 64u_byteswap and puppets for 64/16 byteswap
---
 volk/apps/volk_profile.cc                       |  6 +--
 volk/kernels/volk/volk_16u_byteswappuppet_16u.h | 46 +++++++++++++++++++++
 volk/kernels/volk/volk_64u_byteswap.h           | 55 +++++++++++++++++++++++++
 volk/kernels/volk/volk_64u_byteswappuppet_64u.h | 45 ++++++++++++++++++++
 4 files changed, 149 insertions(+), 3 deletions(-)

diff --git a/volk/apps/volk_profile.cc b/volk/apps/volk_profile.cc
index b1f10f2..4167f4d 100644
--- a/volk/apps/volk_profile.cc
+++ b/volk/apps/volk_profile.cc
@@ -157,7 +157,7 @@ int main(int argc, char *argv[]) {
     //VOLK_PROFILE(volk_16i_max_star_horizontal_16i, 0, 0, 204602, 10000, 
&results, benchmark_mode, kernel_regex);
     //VOLK_PROFILE(volk_16i_permute_and_scalar_add, 1e-4, 0, 2046, 10000, 
&results, benchmark_mode, kernel_regex);
     //VOLK_PROFILE(volk_16i_x4_quad_max_star_16i, 1e-4, 0, 2046, 10000, 
&results, benchmark_mode, kernel_regex);
-    VOLK_PROFILE(volk_16u_byteswap, 0, 0, 204602, 10000, &results, 
benchmark_mode, kernel_regex);
+    VOLK_PUPPET_PROFILE(volk_16u_byteswappuppet_16u, volk_16u_byteswap, 0, 0, 
204602, 10000, &results, benchmark_mode, kernel_regex);
     VOLK_PROFILE(volk_16i_32fc_dot_prod_32fc, 1e-4, 0, 204602, 10000, 
&results, benchmark_mode, kernel_regex);
     VOLK_PROFILE(volk_32f_accumulator_s32f, 1e-4, 0, 204602, 10000, &results, 
benchmark_mode, kernel_regex);
     VOLK_PROFILE(volk_32f_x2_add_32f, 1e-4, 0, 204602, 10000, &results, 
benchmark_mode, kernel_regex);
@@ -175,7 +175,7 @@ int main(int argc, char *argv[]) {
     VOLK_PROFILE(volk_32f_s32f_calc_spectral_noise_floor_32f, 1e-4, 20.0, 
204602, 1000, &results, benchmark_mode, kernel_regex);
     VOLK_PROFILE(volk_32fc_s32f_atan2_32f, 1e-4, 10.0, 204602, 100, &results, 
benchmark_mode, kernel_regex);
     //VOLK_PROFILE(volk_32fc_x2_conjugate_dot_prod_32fc, 1e-4, 0, 2046, 10000, 
&results, benchmark_mode, kernel_regex);
-    VOLK_PROFILE(volk_32fc_x2_conjugate_dot_prod_32fc, 1e-4, 0, 204602, 10000, 
&results, benchmark_mode, kernel_regex);
+    VOLK_PROFILE(volk_32fc_x2_conjugate_dot_prod_32fc, 1e-4, 0, 204602, 100, 
&results, benchmark_mode, kernel_regex);
     VOLK_PROFILE(volk_32fc_deinterleave_32f_x2, 1e-4, 0, 204602, 1000, 
&results, benchmark_mode, kernel_regex);
     VOLK_PROFILE(volk_32fc_deinterleave_64f_x2, 1e-4, 0, 204602, 1000, 
&results, benchmark_mode, kernel_regex);
     VOLK_PROFILE(volk_32fc_s32f_deinterleave_real_16i, 0, 32768, 204602, 
10000, &results, benchmark_mode, kernel_regex);
@@ -224,7 +224,7 @@ int main(int argc, char *argv[]) {
     VOLK_PROFILE(volk_64f_convert_32f, 1e-4, 0, 204602, 10000, &results, 
benchmark_mode, kernel_regex);
     VOLK_PROFILE(volk_64f_x2_max_64f, 1e-4, 0, 204602, 1000, &results, 
benchmark_mode, kernel_regex);
     VOLK_PROFILE(volk_64f_x2_min_64f, 1e-4, 0, 204602, 1000, &results, 
benchmark_mode, kernel_regex);
-    VOLK_PROFILE(volk_64u_byteswap, 0, 0, 204602, 1000, &results, 
benchmark_mode, kernel_regex);
+    VOLK_PUPPET_PROFILE(volk_64u_byteswappuppet_64u, volk_64u_byteswap, 0, 0, 
204602, 1000, &results, benchmark_mode, kernel_regex);
     VOLK_PUPPET_PROFILE(volk_64u_popcntpuppet_64u, volk_64u_popcnt, 0, 0, 
2046, 10000, &results, benchmark_mode, kernel_regex);
     VOLK_PROFILE(volk_8ic_deinterleave_16i_x2, 0, 0, 204602, 3000, &results, 
benchmark_mode, kernel_regex);
     VOLK_PROFILE(volk_8ic_s32f_deinterleave_32f_x2, 1e-4, 100, 204602, 3000, 
&results, benchmark_mode, kernel_regex);
diff --git a/volk/kernels/volk/volk_16u_byteswappuppet_16u.h 
b/volk/kernels/volk/volk_16u_byteswappuppet_16u.h
new file mode 100644
index 0000000..699a758
--- /dev/null
+++ b/volk/kernels/volk/volk_16u_byteswappuppet_16u.h
@@ -0,0 +1,46 @@
+#ifndef INCLUDED_volk_16u_byteswappuppet_16u_H
+#define INCLUDED_volk_16u_byteswappuppet_16u_H
+
+
+#include <stdint.h>
+#include <volk/volk_16u_byteswap.h>
+#include <string.h>
+
+#ifdef LV_HAVE_GENERIC
+/*!
+  \brief Puppet for volk_16u_byteswap_generic: calls the kernel on intsToSwap,
+         then copies num_points values from intsToSwap into output.
+  \param output Destination of the copy
+  \param intsToSwap Buffer passed to the kernel and used as the copy source
+  \param num_points Number of uint16_t values to process and copy
+*/
+static inline void
+volk_16u_byteswappuppet_16u_generic(uint16_t* output,
+                                    uint16_t* intsToSwap,
+                                    unsigned int num_points)
+{
+    const size_t byte_count = num_points * sizeof(uint16_t);
+
+    volk_16u_byteswap_generic(intsToSwap, num_points);
+    memcpy(output, intsToSwap, byte_count);
+}
+#endif /* LV_HAVE_GENERIC */
+
+#ifdef LV_HAVE_NEON
+/*!
+  \brief Puppet for volk_16u_byteswap_neon: calls the kernel on intsToSwap,
+         then copies num_points values from intsToSwap into output.
+  \param output Destination of the copy
+  \param intsToSwap Buffer passed to the kernel and used as the copy source
+  \param num_points Number of uint16_t values to process and copy
+*/
+static inline void
+volk_16u_byteswappuppet_16u_neon(uint16_t* output,
+                                 uint16_t* intsToSwap,
+                                 unsigned int num_points)
+{
+    const size_t byte_count = num_points * sizeof(uint16_t);
+
+    volk_16u_byteswap_neon(intsToSwap, num_points);
+    memcpy(output, intsToSwap, byte_count);
+}
+#endif /* LV_HAVE_NEON */
+
+#ifdef LV_HAVE_SSE2
+/*!
+  \brief Puppet for volk_16u_byteswap_u_sse2: calls the kernel on intsToSwap,
+         then copies num_points values from intsToSwap into output.
+  \param output Destination of the copy
+  \param intsToSwap Buffer passed to the kernel and used as the copy source
+  \param num_points Number of uint16_t values to process and copy
+*/
+static inline void volk_16u_byteswappuppet_16u_u_sse2(uint16_t* output, uint16_t* intsToSwap, unsigned int num_points){
+
+    /* `output` must be a parameter: the memcpy below writes to it, and the
+       generic/neon variants of this puppet take the same three arguments. */
+    volk_16u_byteswap_u_sse2((uint16_t*)intsToSwap, num_points);
+    memcpy((void*)output, (void*)intsToSwap, num_points * sizeof(uint16_t));
+
+}
+#endif /* LV_HAVE_SSE2 */
+
+#ifdef LV_HAVE_SSE2
+/*!
+  \brief Puppet for volk_16u_byteswap_a_sse2: calls the kernel on intsToSwap,
+         then copies num_points values from intsToSwap into output.
+  \param output Destination of the copy
+  \param intsToSwap Buffer passed to the kernel and used as the copy source
+  \param num_points Number of uint16_t values to process and copy
+*/
+static inline void volk_16u_byteswappuppet_16u_a_sse2(uint16_t* output, uint16_t* intsToSwap, unsigned int num_points){
+
+    /* `output` must be a parameter: the memcpy below writes to it, and the
+       generic/neon variants of this puppet take the same three arguments. */
+    volk_16u_byteswap_a_sse2((uint16_t*)intsToSwap, num_points);
+    memcpy((void*)output, (void*)intsToSwap, num_points * sizeof(uint16_t));
+
+}
+#endif /* LV_HAVE_SSE2 */
+
+
+#endif
diff --git a/volk/kernels/volk/volk_64u_byteswap.h 
b/volk/kernels/volk/volk_64u_byteswap.h
index df71f0e..dce8832 100644
--- a/volk/kernels/volk/volk_64u_byteswap.h
+++ b/volk/kernels/volk/volk_64u_byteswap.h
@@ -104,7 +104,62 @@ static inline void volk_64u_byteswap_generic(uint64_t* 
intsToSwap, unsigned int
 }
 #endif /* LV_HAVE_GENERIC */
 
+#ifdef LV_HAVE_NEON
+#include <arm_neon.h>
+/*!
+  \brief Byteswaps (in-place) a vector of uint64_t's.
+  \param intsToSwap The vector of data to byte swap
+  \param num_points The number of data points
+*/
+static inline void volk_64u_byteswap_neon(uint64_t* intsToSwap, unsigned int num_points){
+  uint32_t* inputPtr = (uint32_t*)intsToSwap;
+  unsigned int number = 0;
+  /* number of full vector iterations; each one handles 4 uint64_t values */
+  unsigned int n8points = num_points / 4;
+
+  uint8x8x4_t input_table;
+  uint8x8_t int_lookup01, int_lookup23, int_lookup45, int_lookup67;
+  uint8x8_t swapped_int01, swapped_int23, swapped_int45, swapped_int67;
+
+  /* these magic numbers are used as byte-indices in the LUT.
+     they are pre-computed to save time. A simple C program
+     can calculate them; for example for lookup01:
+    uint8_t chars[8] = {25, 17, 9, 1, 24, 16, 8, 0};
+    for(ii=0; ii < 8; ++ii) {
+        index += ((uint64_t)(*(chars+ii))) << (ii*8);
+    }
+     vld4_u8 de-interleaves the 32 loaded bytes into four 8-byte rows
+     (row r holds bytes r, r+4, r+8, ...), so table index 8*r + c selects
+     source byte 4*c + r. lookup01 therefore picks bytes 7..0, i.e. the
+     first 64-bit value with its bytes reversed.
+  */
+  int_lookup01 = vcreate_u8(2269495096316185);
+  int_lookup23 = vcreate_u8(146949840772469531);
+  int_lookup45 = vcreate_u8(291630186448622877);
+  int_lookup67 = vcreate_u8(436310532124776223);
+
+  for(number = 0; number < n8points; ++number){
+    input_table = vld4_u8((uint8_t*) inputPtr);
+    swapped_int01 = vtbl4_u8(input_table, int_lookup01);
+    swapped_int23 = vtbl4_u8(input_table, int_lookup23);
+    swapped_int45 = vtbl4_u8(input_table, int_lookup45);
+    swapped_int67 = vtbl4_u8(input_table, int_lookup67);
+    vst1_u8((uint8_t*) inputPtr, swapped_int01);
+    vst1_u8((uint8_t*) (inputPtr+2), swapped_int23);
+    vst1_u8((uint8_t*) (inputPtr+4), swapped_int45);
+    vst1_u8((uint8_t*) (inputPtr+6), swapped_int67);
+
+    /* 32 bytes = 8 uint32_t words were loaded and stored above, so step
+       over all 8. A smaller step would swap part of each group twice and
+       leave the pointer short of element n8points*4, where the tail loop
+       below starts counting. */
+    inputPtr += 8;
+  }
+
+  /* scalar tail: byte-reverse each 32-bit half, then exchange the halves */
+  for(number = n8points * 4; number < num_points; ++number){
+    uint32_t output1 = *inputPtr;
+    uint32_t output2 = inputPtr[1];
 
+    output1 = (((output1 >> 24) & 0xff) | ((output1 >> 8) & 0x0000ff00) | ((output1 << 8) & 0x00ff0000) | ((output1 << 24) & 0xff000000));
+    output2 = (((output2 >> 24) & 0xff) | ((output2 >> 8) & 0x0000ff00) | ((output2 << 8) & 0x00ff0000) | ((output2 << 24) & 0xff000000));
+
+    *inputPtr++ = output2;
+    *inputPtr++ = output1;
+  }
+
+}
+#endif /* LV_HAVE_NEON */
 
 
 #endif /* INCLUDED_volk_64u_byteswap_u_H */
diff --git a/volk/kernels/volk/volk_64u_byteswappuppet_64u.h 
b/volk/kernels/volk/volk_64u_byteswappuppet_64u.h
new file mode 100644
index 0000000..591b223
--- /dev/null
+++ b/volk/kernels/volk/volk_64u_byteswappuppet_64u.h
@@ -0,0 +1,45 @@
+#ifndef INCLUDED_volk_64u_byteswappuppet_64u_H
+#define INCLUDED_volk_64u_byteswappuppet_64u_H
+
+
+#include <stdint.h>
+#include <string.h>
+#include <volk/volk_64u_byteswap.h>
+
+#ifdef LV_HAVE_GENERIC
+/*!
+  \brief Puppet for volk_64u_byteswap_generic: calls the kernel on intsToSwap,
+         then copies num_points values from intsToSwap into output.
+  \param output Destination of the copy
+  \param intsToSwap Buffer passed to the kernel and used as the copy source
+  \param num_points Number of uint64_t values to process and copy
+*/
+static inline void
+volk_64u_byteswappuppet_64u_generic(uint64_t* output,
+                                    uint64_t* intsToSwap,
+                                    unsigned int num_points)
+{
+    const size_t byte_count = num_points * sizeof(uint64_t);
+
+    volk_64u_byteswap_generic(intsToSwap, num_points);
+    memcpy(output, intsToSwap, byte_count);
+}
+#endif /* LV_HAVE_GENERIC */
+
+#ifdef LV_HAVE_NEON
+/*!
+  \brief Puppet for volk_64u_byteswap_neon: calls the kernel on intsToSwap,
+         then copies num_points values from intsToSwap into output.
+  \param output Destination of the copy
+  \param intsToSwap Buffer passed to the kernel and used as the copy source
+  \param num_points Number of uint64_t values to process and copy
+*/
+static inline void
+volk_64u_byteswappuppet_64u_neon(uint64_t* output,
+                                 uint64_t* intsToSwap,
+                                 unsigned int num_points)
+{
+    const size_t byte_count = num_points * sizeof(uint64_t);
+
+    volk_64u_byteswap_neon(intsToSwap, num_points);
+    memcpy(output, intsToSwap, byte_count);
+}
+#endif /* LV_HAVE_NEON */
+
+#ifdef LV_HAVE_SSE2
+/*!
+  \brief Puppet for volk_64u_byteswap_u_sse2: calls the kernel on intsToSwap,
+         then copies num_points values from intsToSwap into output.
+  \param output Destination of the copy
+  \param intsToSwap Buffer passed to the kernel and used as the copy source
+  \param num_points Number of uint64_t values to process and copy
+*/
+static inline void volk_64u_byteswappuppet_64u_u_sse2(uint64_t* output, uint64_t* intsToSwap, unsigned int num_points){
+
+    /* `output` must be a parameter: the memcpy below writes to it, and the
+       generic/neon variants of this puppet take the same three arguments. */
+    volk_64u_byteswap_u_sse2((uint64_t*)intsToSwap, num_points);
+    memcpy((void*)output, (void*)intsToSwap, num_points * sizeof(uint64_t));
+
+}
+#endif /* LV_HAVE_SSE2 */
+
+#ifdef LV_HAVE_SSE2
+/*!
+  \brief Puppet for volk_64u_byteswap_a_sse2: calls the kernel on intsToSwap,
+         then copies num_points values from intsToSwap into output.
+  \param output Destination of the copy
+  \param intsToSwap Buffer passed to the kernel and used as the copy source
+  \param num_points Number of uint64_t values to process and copy
+*/
+static inline void volk_64u_byteswappuppet_64u_a_sse2(uint64_t* output, uint64_t* intsToSwap, unsigned int num_points){
+
+    /* `output` must be a parameter: the memcpy below writes to it, and the
+       generic/neon variants of this puppet take the same three arguments. */
+    volk_64u_byteswap_a_sse2((uint64_t*)intsToSwap, num_points);
+    memcpy((void*)output, (void*)intsToSwap, num_points * sizeof(uint64_t));
+
+}
+#endif /* LV_HAVE_SSE2 */
+
+
+#endif



reply via email to

[Prev in Thread] Current Thread [Next in Thread]