libcvd-members
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

[Libcvd-members] libcvd/cvd_src utility.cc


From: Ethan Eade
Subject: [Libcvd-members] libcvd/cvd_src utility.cc
Date: Tue, 16 May 2006 13:15:22 +0000

CVSROOT:        /cvsroot/libcvd
Module name:    libcvd
Branch:         
Changes by:     Ethan Eade <address@hidden>     06/05/16 13:15:22

Modified files:
        cvd_src        : utility.cc 

Log message:
        Implementations of utility specialisations.  See utility.h

CVSWeb URLs:
http://cvs.savannah.gnu.org/viewcvs/libcvd/libcvd/cvd_src/utility.cc.diff?tr1=1.1&tr2=1.2&r1=text&r2=text

Patches:
Index: libcvd/cvd_src/utility.cc
diff -u libcvd/cvd_src/utility.cc:1.1 libcvd/cvd_src/utility.cc:1.2
--- libcvd/cvd_src/utility.cc:1.1       Wed Dec  7 14:45:19 2005
+++ libcvd/cvd_src/utility.cc   Tue May 16 13:15:22 2006
@@ -3,152 +3,539 @@
 // internal functions used by CVD vision algorithm implementations
 #include <cvd/internal/assembly.h>
 
+#if CVD_HAVE_MMINTRIN
+#    include <mmintrin.h>
+#endif
+
+#if CVD_HAVE_XMMINTRIN
+#    include <xmmintrin.h>
+#endif
+
+#if CVD_HAVE_EMMINTRIN
+#    include <emmintrin.h>
+#endif
+
 using namespace std;
 
+namespace  // anonymous, to hide symbols
+{
+    using CVD::is_aligned;
+    using CVD::steps_to_align;
+    // Runs the "differences" kernels of policy F over count elements of a and b,
+    // writing the results to c.  A is the byte alignment the aligned kernel
+    // needs and M the element granularity it is called with.
+    //  - elements in front of the first A-aligned address of a, and any tail
+    //    shorter than M, go through F::unaligned_differences;
+    //  - if c is not A-aligned once a is, everything left takes the unaligned
+    //    path;
+    //  - b is not checked here; F::aligned_differences deals with it.
+    template <class F, class T1, class T2, int A, int M> inline void
+    maybe_aligned_differences(const T1* a, const T1* b, T2* c, unsigned int count)
+    {
+        if (!is_aligned<A>(a)) {
+            unsigned int steps = steps_to_align<A>(a);
+            if (steps >= count) {
+                // The range ends before a becomes aligned.  Without this guard
+                // the head call would process more than count elements and
+                // "count -= steps" would wrap around (count is unsigned).
+                F::unaligned_differences(a,b,c,count);
+                return;
+            }
+            F::unaligned_differences(a,b,c,steps);
+            count -= steps;
+            a += steps;
+            b += steps;
+            c += steps;
+        }
+        if (!is_aligned<A>(c)) {
+            F::unaligned_differences(a,b,c,count);
+            return;
+        }
+        // Largest multiple of M that fits; the remainder is the unaligned tail.
+        unsigned int block = (count/M)*M;
+        F::aligned_differences(a,b,c,block);
+        if (count > block)
+            F::unaligned_differences(a+block,b+block,c+block,count-block);
+    }
+    
+    // Runs the "add_mul_add" kernels of policy F over count elements of a and b
+    // with scalar c, accumulating into out.  A is the byte alignment the
+    // aligned kernel needs and M the element granularity it is called with.
+    // Elements in front of the first A-aligned address of a, and any tail
+    // shorter than M, go through F::unaligned_add_mul_add.
+    template <class F, class T1, class T2, int A, int M> inline void
+    maybe_aligned_add_mul_add(const T1* a, const T1* b, const T1& c, T2* out, unsigned int count)
+    {
+        if (!is_aligned<A>(a)) {
+            unsigned int steps = steps_to_align<A>(a);
+            if (steps >= count) {
+                // The range ends before a becomes aligned; processing "steps"
+                // elements would overrun it and wrap the unsigned count.
+                F::unaligned_add_mul_add(a,b,c,out,count);
+                return;
+            }
+            F::unaligned_add_mul_add(a,b,c,out,steps);
+            count -= steps;
+            a += steps;
+            b += steps;
+            out += steps;
+        }
+        if (!is_aligned<A>(out)) {
+            // The aligned kernels read and write out through vector pointers,
+            // so a misaligned out must stay on the unaligned path (same rule
+            // maybe_aligned_differences applies to its output).
+            F::unaligned_add_mul_add(a,b,c,out,count);
+            return;
+        }
+        // Largest multiple of M that fits; the remainder is the unaligned tail.
+        unsigned int block = (count/M)*M;
+        F::aligned_add_mul_add(a,b,c,out,block);
+        if (count > block)
+            F::unaligned_add_mul_add(a+block,b+block,c, out+block,count-block);
+    }
+
+    // Runs the "assign_mul" kernels of policy F over count elements of a with
+    // scalar c, writing to out.  A is the byte alignment the aligned kernel
+    // needs and M the element granularity it is called with.  Elements in
+    // front of the first A-aligned address of a, and any tail shorter than M,
+    // go through F::unaligned_assign_mul.
+    template <class F, class T1, class T2, int A, int M> inline void
+    maybe_aligned_assign_mul(const T1* a, const T1& c, T2* out, unsigned int count)
+    {
+        if (!is_aligned<A>(a)) {
+            unsigned int steps = steps_to_align<A>(a);
+            if (steps >= count) {
+                // The range ends before a becomes aligned; processing "steps"
+                // elements would overrun it and wrap the unsigned count.
+                F::unaligned_assign_mul(a,c,out,count);
+                return;
+            }
+            F::unaligned_assign_mul(a,c,out,steps);
+            count -= steps;
+            a += steps;
+            out += steps;
+        }
+        if (!is_aligned<A>(out)) {
+            // The aligned kernels store to out through vector pointers, so a
+            // misaligned out must stay on the unaligned path.
+            F::unaligned_assign_mul(a,c,out,count);
+            return;
+        }
+        // Largest multiple of M that fits; the remainder is the unaligned tail.
+        unsigned int block = (count/M)*M;
+        F::aligned_assign_mul(a,c,out,block);
+        if (count > block)
+            F::unaligned_assign_mul(a+block,c, out+block,count-block);
+    }
+
+    // Accumulates the "inner_product" kernels of policy F over count elements
+    // of a and b and returns the total as R.  A is the byte alignment the
+    // aligned kernel needs and M the element granularity it is called with.
+    // Elements in front of the first A-aligned address of a, and any tail
+    // shorter than M, go through F::unaligned_inner_product.
+    template <class F, class R, class T1, int A, int M> inline R
+    maybe_aligned_inner_product(const T1* a, const T1* b, unsigned int count)
+    {
+        R sum = 0;
+        if (!is_aligned<A>(a)) {
+            unsigned int steps = steps_to_align<A>(a);
+            if (steps >= count) {
+                // The range ends before a becomes aligned; processing "steps"
+                // elements would overrun it and wrap the unsigned count.
+                return F::unaligned_inner_product(a,b,count);
+            }
+            sum = F::unaligned_inner_product(a,b,steps);
+            count -= steps;
+            a += steps;
+            b += steps;
+        }
+        // Largest multiple of M that fits; the remainder is the unaligned tail.
+        unsigned int block = (count/M)*M;
+        sum += F::aligned_inner_product(a,b,block);
+        if (count > block)
+            sum += F::unaligned_inner_product(a+block,b+block,count-block);
+        return sum;
+    }
+
+    // Accumulates the "ssd" kernels of policy F over count elements of a and b
+    // and returns the total as R.  A is the byte alignment the aligned kernel
+    // needs and M the element granularity it is called with.  Elements in
+    // front of the first A-aligned address of a, and any tail shorter than M,
+    // go through F::unaligned_ssd.
+    // count is size_t because the sum_squared_differences overloads that call
+    // this helper take a size_t; an unsigned int here would silently truncate.
+    template <class F, class R, class T1, int A, int M> inline R
+    maybe_aligned_ssd(const T1* a, const T1* b, size_t count)
+    {
+        R sum = 0;
+        if (!is_aligned<A>(a)) {
+            size_t steps = steps_to_align<A>(a);
+            if (steps >= count) {
+                // The range ends before a becomes aligned; processing "steps"
+                // elements would overrun it and wrap the unsigned count.
+                return F::unaligned_ssd(a,b,count);
+            }
+            sum = F::unaligned_ssd(a,b,steps);
+            count -= steps;
+            a += steps;
+            b += steps;
+        }
+        // Largest multiple of M that fits; the remainder is the unaligned tail.
+        size_t block = (count/M)*M;
+        sum += F::aligned_ssd(a,b,block);
+        if (count > block)
+            sum += F::unaligned_ssd(a+block,b+block,count-block);
+        return sum;
+    }
+
+}
+
 namespace CVD {
 
+#if defined(CVD_HAVE_MMXEXT) && defined(CVD_HAVE_MMINTRIN)
+
+
+    // For each of the count 64-bit words of a and b: zero-extend the eight
+    // bytes to 16 bits and subtract b from a lane by lane.  Every input word
+    // yields two output words (low four lanes, then high four lanes), so diff
+    // must hold 2*count words.  MMX state is cleared before returning.
+    void byte_to_short_differences(const __m64* a, const __m64* b, __m64* diff, unsigned int count)
+    {
+        const __m64 zero = _mm_setzero_si64();
+        for (unsigned int i = 0; i < count; ++i) {
+            const __m64 av = a[i];
+            const __m64 bv = b[i];
+            const __m64 a_low  = _mm_unpacklo_pi8(av, zero);
+            const __m64 a_high = _mm_unpackhi_pi8(av, zero);
+            const __m64 b_low  = _mm_unpacklo_pi8(bv, zero);
+            const __m64 b_high = _mm_unpackhi_pi8(bv, zero);
+            diff[2*i]     = _mm_sub_pi16(a_low, b_low);
+            diff[2*i + 1] = _mm_sub_pi16(a_high, b_high);
+        }
+        _mm_empty();
+    }
+
+    // Subtracts b from a as packed 16-bit lanes, one 64-bit word at a time,
+    // for count words.  MMX state is cleared before returning.
+    void short_differences(const __m64* a, const __m64* b, __m64* diff, unsigned int count)
+    {
+        for (unsigned int i = 0; i < count; ++i) {
+            diff[i] = _mm_sub_pi16(a[i], b[i]);
+        }
+        _mm_empty();
+    }
+
+    
+    // Kernel policy handed to maybe_aligned_differences for the MMX paths.
+    struct MMX_funcs {
+        // Scalar fallback: forwards to the generic differences<T1,T2> template.
+        template <class T1, class T2> static inline void
+        unaligned_differences(const T1* a, const T1* b, T2* diff, size_t count)
+        {
+            differences<T1,T2>(a,b,diff,count);
+        }
+
+        // byte -> short.  The MMX kernel is used only when b is 8-byte aligned;
+        // count>>3 is the number of 8-byte input words.
+        static inline void aligned_differences(const byte* a, const byte* b, short* diff, unsigned int count)
+        {
+            if (!is_aligned<8>(b)) {
+                unaligned_differences(a,b,diff,count);
+                return;
+            }
+            byte_to_short_differences(reinterpret_cast<const __m64*>(a),
+                                      reinterpret_cast<const __m64*>(b),
+                                      reinterpret_cast<__m64*>(diff),
+                                      count>>3);
+        }
+
+        // short -> short.  The MMX kernel is used only when b is 8-byte aligned;
+        // count>>2 is the number of 4-lane words.
+        static inline void aligned_differences(const short* a, const short* b, short* diff, unsigned int count)
+        {
+            if (!is_aligned<8>(b)) {
+                unaligned_differences(a,b,diff,count);
+                return;
+            }
+            short_differences(reinterpret_cast<const __m64*>(a),
+                              reinterpret_cast<const __m64*>(b),
+                              reinterpret_cast<__m64*>(diff),
+                              count>>2);
+        }
+    };
 
-#if defined(CVD_HAVE_MMXEXT) && defined(CVD_HAVE_CPU_i686)
-  void differences(const unsigned char* a, const unsigned char* b, short* 
diff, unsigned int size)
-  {
-    if (!is_aligned<8>(a) || !is_aligned<8>(b) || !is_aligned<8>(diff)) {      
-      unsigned int steps = std::min(8 - ((size_t)a&0x7), size);
-      if (((size_t)(b+steps) & 0x7) || ((size_t)(diff+steps)&0x7)) {
-       differences<unsigned char, short>(a,b,diff,size);
-       return;
-      }
-      differences<unsigned char, short>(a,b,diff,steps);
-      if (steps == size)
-       return;
-      a += steps;
-      b += steps;
-      diff += steps;
-      size -= steps;
-    }
-    unsigned int block = size & (~0x7);
-    Internal::Assembly::byte_to_short_difference(a,b,diff,block);
-    if (size > block)
-      differences<unsigned char, short>(a+block,b+block,diff+block,size-block);
-  }
-#endif
-  
-
-#if defined(CVD_HAVE_MMXEXT) && defined(CVD_HAVE_CPU_i686)
-  void differences(const short* a, const short* b, short* diff, unsigned int 
size)
-  {
-    if (!is_aligned<8>(a) || !is_aligned<8>(b) || !is_aligned<8>(diff)) {
-      unsigned int steps = std::min(4 - ((size_t)a&0x7)/2, size);
-      if (((size_t)(b+steps) & 0x7) || ((size_t)(diff+steps)&0x7)) {
-       differences<short, short>(a,b,diff,size);
-       return;
-      }
-      differences<short, short>(a,b,diff,steps);
-      if (steps == size)
-       return;
-      a += steps;
-      b += steps;
-      diff += steps;
-      size -= steps;
-    }
-    unsigned int block = size & (~0x3);
-    Internal::Assembly::short_difference(a,b,diff,block);
-    if (size > block)
-      differences<short, short>(a+block,b+block,diff+block,size-block);
-  }
-#endif
-
-#if defined(CVD_HAVE_MMXEXT) && defined(CVD_HAVE_CPU_i686)
-  void differences(const int32_t* a, const int32_t* b, int32_t* diff, unsigned 
int size)
-  {
-    if (!is_aligned<8>(a) || !is_aligned<8>(b) || !is_aligned<8>(diff)) {
-      unsigned int steps = std::min(2 - ((size_t)a&0x7)/4, size);
-      if (((size_t)(b+steps) & 0x7) || ((size_t)(diff+steps)&0x7)) {
-       differences<int32_t, int32_t>(a,b,diff,size);
-       return;
-      }
-      differences<int32_t, int32_t>(a,b,diff,steps);
-      if (steps == size)
-       return;
-      a += steps;
-      b += steps;
-      diff += steps;
-      size -= steps;
-    }
-    unsigned int block = size & (~0x1);
-    Internal::Assembly::int_difference(a,b,diff,block);
-    if (size > block)
-      differences<int32_t, int32_t>(a+block,b+block,diff+block,size-block);
-  }
-#endif
-
-#if defined(CVD_HAVE_SSE) && defined(CVD_HAVE_CPU_i686)
-  void differences(const float* a, const float* b, float* diff, unsigned int 
size)
-  {
-    if (!is_aligned<16>(a) || !is_aligned<16>(b) || !is_aligned<16>(diff)) {
-      unsigned int steps = std::min(4 - ((size_t)a&0xF)/4, size);
-      if (((size_t)(b+steps) & 0xF) || ((size_t)(diff+steps)&0xF)) {
-       differences<float, float>(a,b,diff,size);
-       return;
-      }
-      differences<float, float>(a,b,diff,steps);
-      if (steps == size)
-       return;
-      a += steps;
-      b += steps;
-      diff += steps;
-      size -= steps;
-    }
-    unsigned int block = size & (~0x3);
-    Internal::Assembly::float_difference(a,b,diff,block);
-    if (size > block)
-      differences<float,float>(a+block,b+block,diff+block,size-block);
-  }
-#endif
-
-#if defined(CVD_HAVE_SSE) && defined(CVD_HAVE_CPU_i686)
-  void add_mul_add(const float* a, const float* b, const float& f, float* c, 
size_t size)
-  {
-    if (!is_aligned<16>(a)) {
-      size_t steps = std::min(4 - ((size_t)a&0xF)/4, size);
-      add_mul_add<float>(a,b,f,c,steps);
-      a += steps;
-      b += steps;
-      c += steps;
-      size -= steps;
-      if (size == 0)
-       return;
-    }
-    size_t block = size & (~0x3);
-    if (is_aligned<16>(a) && is_aligned<16>(b) && is_aligned<16>(c))
-      Internal::Assembly::float_add_mul_add(a,b,f,c,block);
-    else
-      Internal::Assembly::float_add_mul_add_unaligned(a,b,f,c,block);
-    if (size > block)
-      add_mul_add<float>(a+block,b+block,f,c+block,size-block);      
-  }
-#endif
-
-#if defined(CVD_HAVE_SSE) && defined(CVD_HAVE_CPU_i686)
-  void assign_mul(const float* in, const float& f, float* out, size_t size) {
-    if (!is_aligned<16>(in)) {
-      size_t steps = std::min(4 - ((size_t)in&0xF)/4, size);
-      assign_mul<float>(in,f,out,steps);
-      in += steps;
-      out += steps;
-      size -= steps;
-      if (size == 0)
-       return;
-    }
-    size_t block = size & (~0x3);
-    if (is_aligned<16>(in) && is_aligned<16>(out))
-      Internal::Assembly::float_assign_mul(in,f,out,block);
-    else {
-      assign_mul<float>(in,f,out,size);
-      return;
-    }
-    if (size > block)
-      assign_mul<float>(in+block,f,out+block,size-block);    
-  }
+    // byte inputs, short output: routed through the MMX kernels with 8-byte
+    // alignment and aligned blocks that are multiples of 8 elements.
+    void differences(const byte* a, const byte* b, short* diff, unsigned int 
count) {
+       maybe_aligned_differences<MMX_funcs, byte, short, 8, 8>(a,b,diff,count);
+    }
+    
+    // short inputs and output: routed through the MMX kernels with 8-byte
+    // alignment and aligned blocks that are multiples of 4 elements.
+    void differences(const short* a, const short* b, short* diff, unsigned int 
count) {
+       maybe_aligned_differences<MMX_funcs, short, short, 8, 
4>(a,b,diff,count);
+    }
 #endif
 
+
+#if defined(CVD_HAVE_SSE) && defined(CVD_HAVE_XMMINTRIN)
+
+    // Loads four packed floats from addr.  The primary template uses the
+    // unaligned load and is what load_ps<false> resolves to.
+    template <bool Aligned> inline __m128 load_ps(const void* addr)
+    {
+        return _mm_loadu_ps(static_cast<const float*>(addr));
+    }
+
+    // Aligned == true: uses the aligned load instead.
+    template <> inline __m128 load_ps<true>(const void* addr)
+    {
+        return _mm_load_ps(static_cast<const float*>(addr));
+    }
+
+    // diff[i] = a[i] - b[i] on packed floats, for count 128-bit words.
+    // a and diff are accessed directly as __m128; b goes through
+    // load_ps<Aligned_b>.
+    template <bool Aligned_b> void
+    float_differences(const __m128* a, const __m128* b, __m128* diff, unsigned int count)
+    {
+        for (unsigned int i = 0; i < count; ++i) {
+            diff[i] = _mm_sub_ps(a[i], load_ps<Aligned_b>(b + i));
+        }
+    }
+    
+    // out[i] = (a[i] + b[i]) * c + out[i] on packed floats, for count 128-bit
+    // words.  a and out are accessed directly as __m128; b goes through
+    // load_ps<Aligned_b>.
+    template <bool Aligned_b> void
+    float_add_multiple_of_sum(const __m128* a, const __m128* b, const float& c, __m128* out, unsigned int count)
+    {
+        const __m128 scale = _mm_set1_ps(c);
+        for (unsigned int i = 0; i < count; ++i) {
+            const __m128 sum = _mm_add_ps(a[i], load_ps<Aligned_b>(b + i));
+            out[i] = _mm_add_ps(_mm_mul_ps(sum, scale), out[i]);
+        }
+    }
+
+    // out[i] = a[i] * c on packed floats, for count 128-bit words.
+    void float_assign_multiple(const __m128* a, const float& c, __m128* out, unsigned int count)
+    {
+        const __m128 scale = _mm_set1_ps(c);
+        for (unsigned int i = 0; i < count; ++i) {
+            out[i] = _mm_mul_ps(a[i], scale);
+        }
+    }
+    
+    // Dot product of count 128-bit words (four floats each) of a and b.
+    // Products are accumulated in four float lanes for at most BLOCK words at
+    // a time; each block's lane total is then added to a double accumulator.
+    // a is read directly as __m128; b goes through load_ps<Aligned_b>.
+    //
+    // count is size_t so that std::min(count, BLOCK) has two arguments of the
+    // same type: with an unsigned int count the call cannot deduce its
+    // template argument wherever size_t is a different type.  This also
+    // matches float_sum_squared_differences.
+    template <bool Aligned_b> double
+    float_inner_product(const __m128* a, const __m128* b, size_t count)
+    {
+        float sums_store[4];
+        const size_t BLOCK = 1<<10;
+        double dot = 0;
+        while (count) {
+            size_t pass = std::min(count, BLOCK);
+            count-=pass;
+            __m128 sums = _mm_setzero_ps();
+            while (pass--) {
+                __m128 prod = _mm_mul_ps(*(a++), load_ps<Aligned_b>(b++));
+                sums = _mm_add_ps(prod, sums);
+            }
+            _mm_storeu_ps(sums_store, sums);
+            dot += sums_store[0] + sums_store[1] + sums_store[2] + sums_store[3];
+        }
+        return dot;
+    }
+
+    // Sum of squared differences over count 128-bit words (four floats each).
+    // Squares are accumulated in four float lanes for at most BLOCK words at a
+    // time; each block's lane total is then added to a double accumulator.
+    // a is read directly as __m128; b goes through load_ps<Aligned_b>.
+    template <bool Aligned_b> inline double
+    float_sum_squared_differences(const __m128* a, const __m128* b, size_t count)
+    {
+        const size_t BLOCK = 1<<10;
+        double total = 0;
+        for (size_t remaining = count; remaining != 0; ) {
+            const size_t chunk = std::min(remaining, BLOCK);
+            remaining -= chunk;
+            __m128 acc = _mm_setzero_ps();
+            for (size_t i = 0; i < chunk; ++i, ++a, ++b) {
+                const __m128 delta = _mm_sub_ps(*a, load_ps<Aligned_b>(b));
+                acc = _mm_add_ps(_mm_mul_ps(delta,delta), acc);
+            }
+            float lanes[4];
+            _mm_storeu_ps(lanes, acc);
+            total += lanes[0] + lanes[1] + lanes[2] + lanes[3];
+        }
+        return total;
+    }
+
+    // Kernel policy handed to the maybe_aligned_* helpers for float data.
+    // The unaligned_* members forward to the generic templates; the aligned_*
+    // members reinterpret the arrays as __m128 words (count>>2 of them) and
+    // pick the kernel instantiation according to whether b is 16-byte aligned.
+    struct SSE_funcs {
+        template <class T1, class T2> static inline void
+        unaligned_differences(const T1* a, const T1* b, T2* diff, size_t count)
+        {
+            differences<T1,T2>(a,b,diff,count);
+        }
+
+        static inline void aligned_differences(const float* a, const float* b, float* diff, unsigned int count)
+        {
+            const __m128* va = reinterpret_cast<const __m128*>(a);
+            const __m128* vb = reinterpret_cast<const __m128*>(b);
+            __m128* vdiff = reinterpret_cast<__m128*>(diff);
+            if (is_aligned<16>(b)) {
+                float_differences<true>(va, vb, vdiff, count>>2);
+                return;
+            }
+            float_differences<false>(va, vb, vdiff, count>>2);
+        }
+
+        template <class T1, class T2> static inline void
+        unaligned_add_mul_add(const T1* a, const T1* b, const T1& c, T2* out, size_t count)
+        {
+            add_multiple_of_sum<T1,T2>(a,b,c,out,count);
+        }
+
+        static inline void aligned_add_mul_add(const float* a, const float* b, const float& c, float* out, size_t count)
+        {
+            const __m128* va = reinterpret_cast<const __m128*>(a);
+            const __m128* vb = reinterpret_cast<const __m128*>(b);
+            __m128* vout = reinterpret_cast<__m128*>(out);
+            if (is_aligned<16>(b)) {
+                float_add_multiple_of_sum<true>(va, vb, c, vout, count>>2);
+                return;
+            }
+            float_add_multiple_of_sum<false>(va, vb, c, vout, count>>2);
+        }
+
+        template <class T1, class T2> static inline void
+        unaligned_assign_mul(const T1* a, const T1& c, T2* out, size_t count)
+        {
+            assign_multiple<T1,T2>(a,c,out,count);
+        }
+
+        static inline void aligned_assign_mul(const float* a, const float& c, float* out, size_t count)
+        {
+            float_assign_multiple(reinterpret_cast<const __m128*>(a), c,
+                                  reinterpret_cast<__m128*>(out), count>>2);
+        }
+
+        template <class T1> static inline double
+        unaligned_inner_product(const T1* a, const T1* b, size_t count)
+        {
+            return inner_product<T1>(a,b,count);
+        }
+
+        static inline double aligned_inner_product(const float* a, const float* b, unsigned int count)
+        {
+            const __m128* va = reinterpret_cast<const __m128*>(a);
+            const __m128* vb = reinterpret_cast<const __m128*>(b);
+            if (is_aligned<16>(b))
+                return float_inner_product<true>(va, vb, count>>2);
+            return float_inner_product<false>(va, vb, count>>2);
+        }
+
+        template <class T1> static inline double
+        unaligned_ssd(const T1* a, const T1* b, size_t count)
+        {
+            return sum_squared_differences<T1>(a,b,count);
+        }
+
+        static inline double aligned_ssd(const float* a, const float* b, unsigned int count)
+        {
+            const __m128* va = reinterpret_cast<const __m128*>(a);
+            const __m128* vb = reinterpret_cast<const __m128*>(b);
+            if (is_aligned<16>(b))
+                return float_sum_squared_differences<true>(va, vb, count>>2);
+            return float_sum_squared_differences<false>(va, vb, count>>2);
+        }
+    };
+    
+    // float overload: routed through the SSE kernels with 16-byte alignment
+    // and aligned blocks that are multiples of 4 elements.
+    void differences(const float* a, const float* b, float* diff, unsigned int 
size)
+    {
+       maybe_aligned_differences<SSE_funcs, float, float, 16, 
4>(a,b,diff,size);
+    }
+    
+    // float overload: routed through the SSE kernels with 16-byte alignment
+    // and aligned blocks that are multiples of 4 elements.
+    void add_multiple_of_sum(const float* a, const float* b, const float& c,  
float* out, unsigned int count)
+    {
+       maybe_aligned_add_mul_add<SSE_funcs,float,float,16,4>(a,b,c,out,count);
+    }
+    
+    // float overload: routed through the SSE kernels with 16-byte alignment
+    // and aligned blocks that are multiples of 4 elements.
+    void assign_multiple(const float* a, const float& c,  float* out, unsigned 
int count) 
+    {
+       maybe_aligned_assign_mul<SSE_funcs,float,float,16,4>(a,c,out,count);
+    }
+
+    
+    // float overload returning a double: routed through the SSE kernels with
+    // 16-byte alignment and aligned blocks that are multiples of 4 elements.
+    double inner_product(const float* a, const float* b, unsigned int count) 
+    {
+       return 
maybe_aligned_inner_product<SSE_funcs,double,float,16,4>(a,b,count);
+    }
+
+    // float overload returning a double: routed through the SSE kernels with
+    // 16-byte alignment and aligned blocks that are multiples of 4 elements.
+    double sum_squared_differences(const float* a, const float* b, size_t 
count)
+    {
+       return maybe_aligned_ssd<SSE_funcs,double,float,16,4>(a,b,count);
+    }
+
+#endif
+
+#if defined (CVD_HAVE_SSE2) && defined(CVD_HAVE_EMMINTRIN)
+
+
+    // Returns an all-zero 128-bit integer vector.  Uses the emmintrin.h
+    // intrinsic (already relied on by byte_sum_squared_differences) instead of
+    // hand-written "pxor" inline assembly, which is compiler-specific and
+    // opaque to the optimiser.
+    static inline __m128i zero_si128() { return _mm_setzero_si128(); }
+    // Loads one 128-bit integer vector from addr.  The primary template uses
+    // the unaligned load and is what load_si128<false> resolves to.
+    template <bool Aligned> inline __m128i load_si128(const void* addr)
+    {
+        return _mm_loadu_si128(static_cast<const __m128i*>(addr));
+    }
+
+    // Aligned == true: uses the aligned load instead.
+    template <> inline __m128i load_si128<true>(const void* addr)
+    {
+        return _mm_load_si128(static_cast<const __m128i*>(addr));
+    }
+    // Loads two packed doubles from addr.  The primary template uses the
+    // unaligned load and is what load_pd<false> resolves to.
+    template <bool Aligned> inline __m128d load_pd(const void* addr)
+    {
+        return _mm_loadu_pd(static_cast<const double*>(addr));
+    }
+
+    // Aligned == true: uses the aligned load instead.
+    template <> inline __m128d load_pd<true>(const void* addr)
+    {
+        return _mm_load_pd(static_cast<const double*>(addr));
+    }
+
+
+    // diff[i] = a[i] - b[i] on packed 32-bit integers, for count 128-bit
+    // words.  a and diff are accessed directly as __m128i; b goes through
+    // load_si128<Aligned_b>.
+    template <bool Aligned_b> void
+    int_differences(const __m128i* a, const __m128i* b, __m128i* diff, unsigned int count)
+    {
+        for (unsigned int i = 0; i < count; ++i) {
+            diff[i] = _mm_sub_epi32(a[i], load_si128<Aligned_b>(b + i));
+        }
+    }
+    
+    // diff[i] = a[i] - b[i] on packed doubles, for count 128-bit words.
+    // a and diff are accessed directly as __m128d; b goes through
+    // load_pd<Aligned_b>.
+    template <bool Aligned_b> void
+    double_differences(const __m128d* a, const __m128d* b, __m128d* diff, unsigned int count)
+    {
+        for (unsigned int i = 0; i < count; ++i) {
+            diff[i] = _mm_sub_pd(a[i], load_pd<Aligned_b>(b + i));
+        }
+    }
+
+    // out[i] = (a[i] + b[i]) * c + out[i] on packed doubles, for count 128-bit
+    // words.  a and out are accessed directly as __m128d; b goes through
+    // load_pd<Aligned_b>.
+    template <bool Aligned_b> void
+    double_add_multiple_of_sum(const __m128d* a, const __m128d* b, const double& c, __m128d* out, unsigned int count)
+    {
+        const __m128d scale = _mm_set1_pd(c);
+        for (unsigned int i = 0; i < count; ++i) {
+            const __m128d sum = _mm_add_pd(a[i], load_pd<Aligned_b>(b + i));
+            out[i] = _mm_add_pd(_mm_mul_pd(sum, scale), out[i]);
+        }
+    }
+
+    // out[i] = a[i] * c on packed doubles, for count 128-bit words.
+    void double_assign_multiple(const __m128d* a, const double& c, __m128d* out, unsigned int count)
+    {
+        const __m128d scale = _mm_set1_pd(c);
+        for (unsigned int i = 0; i < count; ++i) {
+            out[i] = _mm_mul_pd(a[i], scale);
+        }
+    }
+    // Dot product of count 128-bit words (two doubles each) of a and b.
+    // Products are accumulated in two vector lanes for at most BLOCK words at
+    // a time; each block's lane total is then added to the scalar result.
+    // a is read directly as __m128d; b goes through load_pd<Aligned_b>.
+    //
+    // count is size_t so that std::min(count, BLOCK) has two arguments of the
+    // same type: with an unsigned int count the call cannot deduce its
+    // template argument wherever size_t is a different type.  This also
+    // matches double_sum_squared_differences.
+    template <bool Aligned_b> double
+    double_inner_product(const __m128d* a, const __m128d* b, size_t count)
+    {
+        double sums_store[2];
+        const size_t BLOCK = 1<<16;
+        double dot = 0;
+        while (count) {
+            size_t pass = std::min(count, BLOCK);
+            count-=pass;
+            __m128d sums = _mm_setzero_pd();
+            while (pass--) {
+                __m128d prod = _mm_mul_pd(*(a++), load_pd<Aligned_b>(b++));
+                sums = _mm_add_pd(prod, sums);
+            }
+            _mm_storeu_pd(sums_store, sums);
+            dot += sums_store[0] + sums_store[1];
+        }
+        return dot;
+    }
+
+    // Sum of squared differences over count 128-bit words (16 unsigned bytes
+    // each) of a and b.  a is loaded as aligned; b goes through
+    // load_si128<Aligned_b>.
+    //
+    // Per word: the bytes are zero-extended to 16-bit lanes, subtracted, and
+    // _mm_madd_epi16(d,d) squares the differences and adds neighbouring pairs
+    // into four 32-bit lanes.
+    //
+    // Fixes relative to the previous revision:
+    //  - the bytes are unpacked against a dedicated zero register.  Unpacking
+    //    against the running accumulator was only correct while it was still
+    //    zero, i.e. for the first word of each block.
+    //  - the lanes are stored into 32-bit integers.  "unsigned long" is 64
+    //    bits wide on LP64 targets, so a 16-byte store filled only half the
+    //    array.
+    //  - BLOCK is 1<<14.  One word adds at most 4*255*255 = 260100 to a lane,
+    //    so 16384 words stay below 2^32, whereas 32768 could overflow.
+    //  - the four lanes are widened to long long before being added.
+    template <bool Aligned_b> long long
+    byte_sum_squared_differences(const __m128i* a, const __m128i* b, size_t count) {
+        unsigned int sums_store[4];   // 4 x 32 bits == one __m128i
+        const size_t BLOCK = 1<<14;
+        const __m128i zero = _mm_setzero_si128();
+        long long ssd = 0;
+        while (count) {
+            size_t pass = std::min(count, BLOCK);
+            count -= pass;
+            __m128i sums = _mm_setzero_si128();
+            while (pass--) {
+                __m128i lo_a = load_si128<true>(a++);
+                __m128i lo_b = load_si128<Aligned_b>(b++);
+                __m128i hi_a = _mm_unpackhi_epi8(lo_a, zero);
+                __m128i hi_b = _mm_unpackhi_epi8(lo_b, zero);
+                lo_a = _mm_unpacklo_epi8(lo_a, zero);
+                lo_b = _mm_unpacklo_epi8(lo_b, zero);
+                lo_a = _mm_sub_epi16(lo_a, lo_b);
+                hi_a = _mm_sub_epi16(hi_a, hi_b);
+                lo_a = _mm_madd_epi16(lo_a,lo_a);
+                hi_a = _mm_madd_epi16(hi_a,hi_a);
+                sums = _mm_add_epi32(_mm_add_epi32(lo_a, hi_a), sums);
+            }
+            _mm_storeu_si128((__m128i*)sums_store, sums);
+            ssd += (long long)sums_store[0] + sums_store[1] + sums_store[2] + sums_store[3];
+        }
+        return ssd;
+    }
+
+    // Sum of squared differences over count 128-bit words (two doubles each).
+    // Squares are accumulated in two vector lanes for at most BLOCK words at a
+    // time; each block's lane total is then added to the scalar result.
+    // a is read directly as __m128d; b goes through load_pd<Aligned_b>.
+    template <bool Aligned_b> inline double
+    double_sum_squared_differences(const __m128d* a, const __m128d* b, size_t count)
+    {
+        const size_t BLOCK = 1<<10;
+        double total = 0;
+        for (size_t remaining = count; remaining != 0; ) {
+            const size_t chunk = std::min(remaining, BLOCK);
+            remaining -= chunk;
+            __m128d acc = _mm_setzero_pd();
+            for (size_t i = 0; i < chunk; ++i, ++a, ++b) {
+                const __m128d delta = _mm_sub_pd(*a, load_pd<Aligned_b>(b));
+                acc = _mm_add_pd(_mm_mul_pd(delta,delta), acc);
+            }
+            double lanes[2];
+            _mm_storeu_pd(lanes, acc);
+            total += lanes[0] + lanes[1];
+        }
+        return total;
+    }
+
+    
+    // Kernel policy handed to the maybe_aligned_* helpers for the SSE2 paths
+    // (int32_t, double and byte data).  The unaligned_* members forward to the
+    // generic implementations; the aligned_* members reinterpret the arrays as
+    // 128-bit words and pick the kernel instantiation according to whether b
+    // is 16-byte aligned.  The shift applied to count converts elements to
+    // words: >>2 for int32_t, >>1 for double, >>4 for byte.
+    struct SSE2_funcs {
+        template <class T1, class T2> static inline void
+        unaligned_differences(const T1* a, const T1* b, T2* diff, size_t count)
+        {
+            differences<T1,T2>(a,b,diff,count);
+        }
+
+        static inline void aligned_differences(const int32_t* a, const int32_t* b, int32_t* diff, unsigned int count)
+        {
+            const __m128i* va = reinterpret_cast<const __m128i*>(a);
+            const __m128i* vb = reinterpret_cast<const __m128i*>(b);
+            __m128i* vdiff = reinterpret_cast<__m128i*>(diff);
+            if (is_aligned<16>(b)) {
+                int_differences<true>(va, vb, vdiff, count>>2);
+                return;
+            }
+            int_differences<false>(va, vb, vdiff, count>>2);
+        }
+
+        static inline void aligned_differences(const double* a, const double* b, double* diff, unsigned int count)
+        {
+            const __m128d* va = reinterpret_cast<const __m128d*>(a);
+            const __m128d* vb = reinterpret_cast<const __m128d*>(b);
+            __m128d* vdiff = reinterpret_cast<__m128d*>(diff);
+            if (is_aligned<16>(b)) {
+                double_differences<true>(va, vb, vdiff, count>>1);
+                return;
+            }
+            double_differences<false>(va, vb, vdiff, count>>1);
+        }
+
+        template <class T1, class T2> static inline void
+        unaligned_add_mul_add(const T1* a, const T1* b, const T1& c, T2* out, size_t count)
+        {
+            add_multiple_of_sum<T1,T2>(a,b,c,out,count);
+        }
+
+        static inline void aligned_add_mul_add(const double* a, const double* b, const double& c, double* out, unsigned int count)
+        {
+            const __m128d* va = reinterpret_cast<const __m128d*>(a);
+            const __m128d* vb = reinterpret_cast<const __m128d*>(b);
+            __m128d* vout = reinterpret_cast<__m128d*>(out);
+            if (is_aligned<16>(b)) {
+                double_add_multiple_of_sum<true>(va, vb, c, vout, count>>1);
+                return;
+            }
+            double_add_multiple_of_sum<false>(va, vb, c, vout, count>>1);
+        }
+
+        template <class T1, class T2> static inline void
+        unaligned_assign_mul(const T1* a, const T1& c, T2* out, size_t count)
+        {
+            assign_multiple<T1,T2>(a,c,out,count);
+        }
+
+        static inline void aligned_assign_mul(const double* a, const double& c, double* out, unsigned int count)
+        {
+            double_assign_multiple(reinterpret_cast<const __m128d*>(a), c,
+                                   reinterpret_cast<__m128d*>(out), count>>1);
+        }
+
+        template <class T1> static inline double
+        unaligned_inner_product(const T1* a, const T1* b, size_t count)
+        {
+            return inner_product<T1>(a,b,count);
+        }
+
+        static inline double aligned_inner_product(const double* a, const double* b, unsigned int count)
+        {
+            const __m128d* va = reinterpret_cast<const __m128d*>(a);
+            const __m128d* vb = reinterpret_cast<const __m128d*>(b);
+            if (is_aligned<16>(b))
+                return double_inner_product<true>(va, vb, count>>1);
+            return double_inner_product<false>(va, vb, count>>1);
+        }
+
+        template <class T1> static inline double
+        unaligned_ssd(const T1* a, const T1* b, size_t count)
+        {
+            return sum_squared_differences<T1>(a,b,count);
+        }
+
+        // byte data: accumulates through SumSquaredDifferences with a
+        // long long result rather than the double-returning template above.
+        static inline long long unaligned_ssd(const byte* a, const byte* b, size_t count)
+        {
+            return SumSquaredDifferences<long long, int, byte>::sum_squared_differences(a,b,count);
+        }
+
+        static inline double aligned_ssd(const double* a, const double* b, size_t count)
+        {
+            const __m128d* va = reinterpret_cast<const __m128d*>(a);
+            const __m128d* vb = reinterpret_cast<const __m128d*>(b);
+            if (is_aligned<16>(b))
+                return double_sum_squared_differences<true>(va, vb, count>>1);
+            return double_sum_squared_differences<false>(va, vb, count>>1);
+        }
+
+        static inline long long aligned_ssd(const byte* a, const byte* b, size_t count)
+        {
+            const __m128i* va = reinterpret_cast<const __m128i*>(a);
+            const __m128i* vb = reinterpret_cast<const __m128i*>(b);
+            if (is_aligned<16>(b))
+                return byte_sum_squared_differences<true>(va, vb, count>>4);
+            return byte_sum_squared_differences<false>(va, vb, count>>4);
+        }
+    };
+
+    // int32_t overload: routed through the SSE2 kernels with 16-byte
+    // alignment and aligned blocks that are multiples of 4 elements.
+    void differences(const int32_t* a, const int32_t* b, int32_t* diff, 
unsigned int size)
+    {
+       maybe_aligned_differences<SSE2_funcs, int32_t, int32_t, 16, 
4>(a,b,diff,size);
+    }
+
+    // double overload: routed through the SSE2 kernels with 16-byte alignment
+    // and aligned blocks that are multiples of 2 elements.
+    void differences(const double* a, const double* b, double* diff, unsigned 
int size)
+    {
+       maybe_aligned_differences<SSE2_funcs, double, double, 16, 
2>(a,b,diff,size);
+    }
+
+    // double overload: routed through the SSE2 kernels with 16-byte alignment
+    // and aligned blocks that are multiples of 2 elements.
+    void add_multiple_of_sum(const double* a, const double* b, const double& 
c,  double* out, unsigned int count)
+    {
+       maybe_aligned_add_mul_add<SSE2_funcs, double, double, 16, 
2>(a,b,c,out,count);
+    }
+
+    // double overload: routed through the SSE2 kernels with 16-byte alignment
+    // and aligned blocks that are multiples of 2 elements.
+    void assign_multiple(const double* a, const double& c,  double* out, 
unsigned int count)
+    {
+       maybe_aligned_assign_mul<SSE2_funcs, double, double, 16, 
2>(a,c,out,count);
+    }
+
+    // double overload: routed through the SSE2 kernels with 16-byte alignment
+    // and aligned blocks that are multiples of 2 elements.
+    double inner_product(const double* a, const double* b, unsigned int count)
+    {
+       return maybe_aligned_inner_product<SSE2_funcs, double, double, 16, 
2>(a,b,count);
+    }
+
+    // double overload: routed through the SSE2 kernels with 16-byte alignment
+    // and aligned blocks that are multiples of 2 elements.
+    double sum_squared_differences(const double* a, const double* b, size_t 
count)
+    {
+       return maybe_aligned_ssd<SSE2_funcs, double, double, 16, 2>(a,b,count);
+    }
+
+    // byte overload returning a long long: routed through the SSE2 kernels
+    // with 16-byte alignment and aligned blocks that are multiples of 16
+    // elements.
+    long long sum_squared_differences(const byte* a, const byte* b, size_t 
count)
+    {
+       return maybe_aligned_ssd<SSE2_funcs, long long, byte, 16, 
16>(a,b,count); 
+    }
+#endif
+    
 }




reply via email to

[Prev in Thread] Current Thread [Next in Thread]