From: Konstantinos Poulios
Subject: [Getfem-commits] [getfem-commits] branch master updated: Use hard coded loop unrollings
Date: Tue, 05 Dec 2023 12:24:30 -0500
This is an automated email from the git hooks/post-receive script.
logari81 pushed a commit to branch master
in repository getfem.
The following commit(s) were added to refs/heads/master by this push:
new 0ce4bf39 Use hard coded loop unrollings
0ce4bf39 is described below
commit 0ce4bf3963d748964714a90851e6cd1dfc17a4b8
Author: Konstantinos Poulios <logari81@gmail.com>
AuthorDate: Tue Dec 5 18:24:02 2023 +0100
Use hard coded loop unrollings
- recursive template unrolling seems to be fragile at least with gcc
- changes do not introduce any significant slowdown in the test_assembly
benchmarks, when compiled with -O2 optimization
---
src/getfem_generic_assembly_compile_and_exec.cc | 401 +++++++++++++++++-------
1 file changed, 289 insertions(+), 112 deletions(-)
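For readers browsing the archive, the essence of the change can be summarised by the following standalone sketch. It is not the actual getfem code; the names dot_recursive and dot_unrolled and the plain double* interface are invented for illustration. The old kernels relied on recursion over a template parameter, which, per the commit message above, proved fragile at least with gcc; the new kernels spell out one explicit specialization per small size.

// Old approach (sketch): unrolling by template recursion; the compiler has
// to fully inline the recursive call chain to obtain straight-line code.
template<int N> inline double dot_recursive(const double *a, const double *b) {
  return a[N-1]*b[N-1] + dot_recursive<N-1>(a, b);
}
template<> inline double dot_recursive<1>(const double *a, const double *b) {
  return a[0]*b[0];
}

// New approach (sketch): a generic fallback plus hand-written, hard coded
// specializations for the small sizes that matter, as in the patch below.
template<int N> inline double dot_unrolled(const double *a, const double *b) {
  double r = a[0]*b[0];
  for (int i = 1; i < N; ++i) r += a[i]*b[i];
  return r;
}
template<> inline double dot_unrolled<3>(const double *a, const double *b) {
  return a[0]*b[0] + a[1]*b[1] + a[2]*b[2];
}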
diff --git a/src/getfem_generic_assembly_compile_and_exec.cc b/src/getfem_generic_assembly_compile_and_exec.cc
index 71793f6a..f51bcf80 100644
--- a/src/getfem_generic_assembly_compile_and_exec.cc
+++ b/src/getfem_generic_assembly_compile_and_exec.cc
@@ -25,7 +25,7 @@
#include "getfem/getfem_generic_assembly_compile_and_exec.h"
#include "getfem/getfem_generic_assembly_functions_and_operators.h"
-// #define GA_USES_BLAS // not so interesting, at least for debian blas
+//#define GA_USES_BLAS
// #define GA_DEBUG_INFO(a) { cout << a << endl; }
#define GA_DEBUG_INFO(a)
@@ -2515,43 +2515,46 @@ namespace getfem {
// Performs Ani Bmi -> Cmn
struct ga_instruction_contraction : public ga_instruction {
- base_tensor &t, &tc1, &tc2;
- size_type nn;
- virtual int exec() {
- GA_DEBUG_INFO("Instruction: contraction operation of size " << nn);
-#if GA_USES_BLAS
- long m = int(tc1.size()/nn), k = int(nn), n = int(tc2.size()/nn);
- long lda = m, ldb = n, ldc = m;
- char T = 'T', N = 'N';
- scalar_type alpha(1), beta(0);
- gmm::dgemm_(&N, &T, &m, &n, &k, &alpha, &(tc1[0]), &lda, &(tc2[0]), &ldb,
- &beta, &(t[0]), &ldc);
+ base_tensor &t;
+ const base_tensor &tc1, &tc2;
+ const size_type I;
+ virtual int exec() {
+ GA_DEBUG_INFO("Instruction: contraction operation of size " << I);
+#if defined(GA_USES_BLAS)
+ BLAS_INT N = BLAS_INT(tc1.size()/I), I_ = BLAS_INT(I),
+ M = BLAS_INT(tc2.size()/I);
+ char notrans = 'N', trans = 'T';
+ static const scalar_type one(1), zero(0);
+ gmm::dgemm_(&notrans, &trans, &M, &N, &I_, &one,
+ &(tc2[0]), &M, &(tc1[0]), &N, &zero, &(t[0]), &M);
#else
- size_type s1 = tc1.size()/nn, s2 = tc2.size()/nn;
- GA_DEBUG_ASSERT(t.size() == s1*s2, "Internal error");
+ size_type N = tc1.size()/I,
+ M = tc2.size()/I;
+ GA_DEBUG_ASSERT(t.size() == N*M, "Internal error");
- auto it1=tc1.begin(), it2=tc2.begin(), it2end=it2 + s2;
+ auto it1=tc1.begin(), it2=tc2.begin(), it2end=it2 + M;
for (auto it = t.begin(); it != t.end(); ++it) {
auto it11 = it1, it22 = it2;
scalar_type a = (*it11) * (*it22);
- for (size_type i = 1; i < nn; ++i)
- { it11 += s1; it22 += s2; a += (*it11) * (*it22); }
+ for (size_type i = 1; i < I; ++i)
+ { it11 += N; it22 += M; a += (*it11) * (*it22); }
*it = a;
++it2; if (it2 == it2end) { it2 = tc2.begin(), ++it1; }
}
// auto it = t.begin(); // Unoptimized version.
- // for (size_type i = 0; i < s1; ++i)
- // for (size_type j = 0; j < s2; ++j, ++it) {
+ // for (size_type n = 0; n < N; ++n)
+ // for (size_type m = 0; m < M; ++m, ++it) {
// *it = scalar_type(0);
- // for (size_type k = 0; k < nn; ++k)
- // *it += tc1[i+k*s1] * tc2[j+k*s2];
+ // for (size_type i = 0; i < I; ++i)
+ // *it += tc1[n+N*i] * tc2[m+M*i];
// }
#endif
return 0;
}
- ga_instruction_contraction(base_tensor &t_, base_tensor &tc1_,
- base_tensor &tc2_, size_type n_)
- : t(t_), tc1(tc1_), tc2(tc2_), nn(n_) {}
+ ga_instruction_contraction(base_tensor &t_,
+ const base_tensor &tc1_,
+ const base_tensor &tc2_, size_type I_)
+ : t(t_), tc1(tc1_), tc2(tc2_), I(I_) {}
};
// Performs Ani Bmi -> Cmn
@@ -2850,29 +2853,116 @@ namespace getfem {
- template<int N> inline scalar_type reduc_elem_unrolled__
- (base_tensor::iterator &it1, base_tensor::iterator &it2,
- size_type s1, size_type s2) {
- return (it1[(N-1)*s1])*(it2[(N-1)*s2])
- + reduc_elem_unrolled__<N-1>(it1, it2, s1, s2);
+ template<int I> inline
+ void reduc_elem_unrolled__(base_tensor::iterator &it,
+ base_tensor::const_iterator &it1,
+ base_tensor::const_iterator &it2,
+ size_type s1, size_type s2) {
+ *it = (*it1)*(*it2);
+ for (int i=1; i < I; ++i)
+ *it += (*(it1+i*s1)) * (*(it2+i*s2));
+ }
+ template<> inline
+ void reduc_elem_unrolled__<9>(base_tensor::iterator &it,
+ base_tensor::const_iterator &it1,
+ base_tensor::const_iterator &it2,
+ size_type s1, size_type s2) {
+ *it = (*it1) * (*it2);
+ *it += (*(it1+s1)) * (*(it2+s2));
+ *it += (*(it1+2*s1)) * (*(it2+2*s2));
+ *it += (*(it1+3*s1)) * (*(it2+3*s2));
+ *it += (*(it1+4*s1)) * (*(it2+4*s2));
+ *it += (*(it1+5*s1)) * (*(it2+5*s2));
+ *it += (*(it1+6*s1)) * (*(it2+6*s2));
+ *it += (*(it1+7*s1)) * (*(it2+7*s2));
+ *it += (*(it1+8*s1)) * (*(it2+8*s2));
+ }
+ template<> inline
+ void reduc_elem_unrolled__<8>(base_tensor::iterator &it,
+ base_tensor::const_iterator &it1,
+ base_tensor::const_iterator &it2,
+ size_type s1, size_type s2) {
+ *it = (*it1) * (*it2);
+ *it += (*(it1+s1)) * (*(it2+s2));
+ *it += (*(it1+2*s1)) * (*(it2+2*s2));
+ *it += (*(it1+3*s1)) * (*(it2+3*s2));
+ *it += (*(it1+4*s1)) * (*(it2+4*s2));
+ *it += (*(it1+5*s1)) * (*(it2+5*s2));
+ *it += (*(it1+6*s1)) * (*(it2+6*s2));
+ *it += (*(it1+7*s1)) * (*(it2+7*s2));
+ }
+ template<> inline
+ void reduc_elem_unrolled__<7>(base_tensor::iterator &it,
+ base_tensor::const_iterator &it1,
+ base_tensor::const_iterator &it2,
+ size_type s1, size_type s2) {
+ *it = (*it1) * (*it2);
+ *it += (*(it1+s1)) * (*(it2+s2));
+ *it += (*(it1+2*s1)) * (*(it2+2*s2));
+ *it += (*(it1+3*s1)) * (*(it2+3*s2));
+ *it += (*(it1+4*s1)) * (*(it2+4*s2));
+ *it += (*(it1+5*s1)) * (*(it2+5*s2));
+ *it += (*(it1+6*s1)) * (*(it2+6*s2));
+ }
+ template<> inline
+ void reduc_elem_unrolled__<6>(base_tensor::iterator &it,
+ base_tensor::const_iterator &it1,
+ base_tensor::const_iterator &it2,
+ size_type s1, size_type s2) {
+ *it = (*it1) * (*it2);
+ *it += (*(it1+s1)) * (*(it2+s2));
+ *it += (*(it1+2*s1)) * (*(it2+2*s2));
+ *it += (*(it1+3*s1)) * (*(it2+3*s2));
+ *it += (*(it1+4*s1)) * (*(it2+4*s2));
+ *it += (*(it1+5*s1)) * (*(it2+5*s2));
}
- template<> inline scalar_type reduc_elem_unrolled__<1>
- (base_tensor::iterator &it1, base_tensor::iterator &it2,
- size_type /*s1*/, size_type /*s2*/)
- { return (*it1)*(*it2); }
+ template<> inline
+ void reduc_elem_unrolled__<5>(base_tensor::iterator &it,
+ base_tensor::const_iterator &it1,
+ base_tensor::const_iterator &it2,
+ size_type s1, size_type s2) {
+ *it = (*it1) * (*it2);
+ *it += (*(it1+s1)) * (*(it2+s2));
+ *it += (*(it1+2*s1)) * (*(it2+2*s2));
+ *it += (*(it1+3*s1)) * (*(it2+3*s2));
+ *it += (*(it1+4*s1)) * (*(it2+4*s2));
+ }
+ template<> inline
+ void reduc_elem_unrolled__<4>(base_tensor::iterator &it,
+ base_tensor::const_iterator &it1,
+ base_tensor::const_iterator &it2,
+ size_type s1, size_type s2) {
+ *it = (*it1) * (*it2);
+ *it += (*(it1+s1)) * (*(it2+s2));
+ *it += (*(it1+2*s1)) * (*(it2+2*s2));
+ *it += (*(it1+3*s1)) * (*(it2+3*s2));
+ }
+ template<> inline
+ void reduc_elem_unrolled__<3>(base_tensor::iterator &it,
+ base_tensor::const_iterator &it1,
+ base_tensor::const_iterator &it2,
+ size_type s1, size_type s2) {
+ *it = (*it1) * (*it2);
+ *it += (*(it1+s1)) * (*(it2+s2));
+ *it += (*(it1+2*s1)) * (*(it2+2*s2));
+ }
+ template<> inline
+ void reduc_elem_unrolled__<2>(base_tensor::iterator &it,
+ base_tensor::const_iterator &it1,
+ base_tensor::const_iterator &it2,
+ size_type s1, size_type s2) {
+ *it = (*it1) * (*it2);
+ *it += (*(it1+s1)) * (*(it2+s2));
+ }
+ template<> inline
+ void reduc_elem_unrolled__<1>(base_tensor::iterator &it,
+ base_tensor::const_iterator &it1,
+ base_tensor::const_iterator &it2,
+ size_type /*s1*/, size_type /*s2*/)
+ { *it = (*it1)*(*it2); }
// Performs Ani Bmi -> Cmn. Unrolled operation.
- template<int N> struct ga_instruction_contraction_unrolled
+ template<int I> struct ga_instruction_contraction_unrolled
: public ga_instruction {
base_tensor &t, &tc1, &tc2;
virtual int exec() {
- GA_DEBUG_INFO("Instruction: unrolled contraction operation of size " << N);
- size_type s1 = tc1.size()/N, s2 = tc2.size()/N;
- GA_DEBUG_ASSERT(t.size() == s1*s2, "Internal error, " << t.size()
- << " != " << s1 << "*" << s2);
- base_tensor::iterator it1=tc1.begin(), it2=tc2.begin(), it2end=it2 + s2;
+ GA_DEBUG_INFO("Instruction: unrolled contraction operation of size " << I);
+ size_type N = tc1.size()/I, M = tc2.size()/I;
+ GA_DEBUG_ASSERT(t.size() == N*M, "Internal error, " << t.size()
+ << " != " << N << "*" << M);
+ base_tensor::const_iterator it1=tc1.cbegin(), it2=tc2.cbegin(), it2end=it2+M;
for (base_tensor::iterator it = t.begin(); it != t.end(); ++it) {
- *it = reduc_elem_unrolled__<N>(it1, it2, s1, s2);
+ reduc_elem_unrolled__<I>(it, it1, it2, N, M);
++it2; if (it2 == it2end) { it2 = tc2.begin(), ++it1; }
}
return 0;
@@ -2883,62 +2973,63 @@ namespace getfem {
};
template<int N, int S2> inline void reduc_elem_d_unrolled__
- (base_tensor::iterator &it, base_tensor::iterator &it1,
- base_tensor::iterator &it2, size_type s1, size_type s2) {
- *it++ = reduc_elem_unrolled__<N>(it1, it2, s1, s2);
- reduc_elem_d_unrolled__<N, S2-1>(it, it1, ++it2, s1, s2);
+ (base_tensor::iterator &it,
+ base_tensor::const_iterator &it1, base_tensor::const_iterator &it2,
+ size_type s1, size_type s2) {
+ reduc_elem_unrolled__<N>(it, it1, it2, s1, s2);
+ reduc_elem_d_unrolled__<N, S2-1>(++it, it1, ++it2, s1, s2);
}
// A Repeated definition is following because partial specialization
// of functions is not allowed in C++ for the moment.
// The gain in assembly time is small compared to the simply unrolled version
template<> inline void reduc_elem_d_unrolled__<1, 0>
- (base_tensor::iterator &/* it */, base_tensor::iterator &/* it1 */,
- base_tensor::iterator &/* it2 */, size_type /* s1 */, size_type /* s2 */) { }
+ (base_tensor::iterator &/* it */, base_tensor::const_iterator &/* it1 */,
+ base_tensor::const_iterator &/* it2 */, size_type /* s1 */, size_type /* s2 */) { }
template<> inline void reduc_elem_d_unrolled__<2, 0>
- (base_tensor::iterator &/* it */, base_tensor::iterator &/* it1 */,
- base_tensor::iterator &/* it2 */, size_type /* s1 */, size_type /* s2 */) { }
+ (base_tensor::iterator &/* it */, base_tensor::const_iterator &/* it1 */,
+ base_tensor::const_iterator &/* it2 */, size_type /* s1 */, size_type /* s2 */) { }
template<> inline void reduc_elem_d_unrolled__<3, 0>
- (base_tensor::iterator &/* it */, base_tensor::iterator &/* it1 */,
- base_tensor::iterator &/* it2 */, size_type /* s1 */, size_type /* s2 */) { }
+ (base_tensor::iterator &/* it */, base_tensor::const_iterator &/* it1 */,
+ base_tensor::const_iterator &/* it2 */, size_type /* s1 */, size_type /* s2 */) { }
template<> inline void reduc_elem_d_unrolled__<4, 0>
- (base_tensor::iterator &/* it */, base_tensor::iterator &/* it1 */,
- base_tensor::iterator &/* it2 */, size_type /* s1 */, size_type /* s2 */) { }
+ (base_tensor::iterator &/* it */, base_tensor::const_iterator &/* it1 */,
+ base_tensor::const_iterator &/* it2 */, size_type /* s1 */, size_type /* s2 */) { }
template<> inline void reduc_elem_d_unrolled__<5, 0>
- (base_tensor::iterator &/* it */, base_tensor::iterator &/* it1 */,
- base_tensor::iterator &/* it2 */, size_type /* s1 */, size_type /* s2 */) { }
+ (base_tensor::iterator &/* it */, base_tensor::const_iterator &/* it1 */,
+ base_tensor::const_iterator &/* it2 */, size_type /* s1 */, size_type /* s2 */) { }
template<> inline void reduc_elem_d_unrolled__<6, 0>
- (base_tensor::iterator &/* it */, base_tensor::iterator &/* it1 */,
- base_tensor::iterator &/* it2 */, size_type /* s1 */, size_type /* s2 */) { }
+ (base_tensor::iterator &/* it */, base_tensor::const_iterator &/* it1 */,
+ base_tensor::const_iterator &/* it2 */, size_type /* s1 */, size_type /* s2 */) { }
template<> inline void reduc_elem_d_unrolled__<7, 0>
- (base_tensor::iterator &/* it */, base_tensor::iterator &/* it1 */,
- base_tensor::iterator &/* it2 */, size_type /* s1 */, size_type /* s2 */) { }
+ (base_tensor::iterator &/* it */, base_tensor::const_iterator &/* it1 */,
+ base_tensor::const_iterator &/* it2 */, size_type /* s1 */, size_type /* s2 */) { }
template<> inline void reduc_elem_d_unrolled__<8, 0>
- (base_tensor::iterator &/* it */, base_tensor::iterator &/* it1 */,
- base_tensor::iterator &/* it2 */, size_type /* s1 */, size_type /* s2 */) { }
+ (base_tensor::iterator &/* it */, base_tensor::const_iterator &/* it1 */,
+ base_tensor::const_iterator &/* it2 */, size_type /* s1 */, size_type /* s2 */) { }
template<> inline void reduc_elem_d_unrolled__<9, 0>
- (base_tensor::iterator &/* it */, base_tensor::iterator &/* it1 */,
- base_tensor::iterator &/* it2 */, size_type /* s1 */, size_type /* s2 */) { }
+ (base_tensor::iterator &/* it */, base_tensor::const_iterator &/* it1 */,
+ base_tensor::const_iterator &/* it2 */, size_type /* s1 */, size_type /* s2 */) { }
template<> inline void reduc_elem_d_unrolled__<10, 0>
- (base_tensor::iterator &/* it */, base_tensor::iterator &/* it1 */,
- base_tensor::iterator &/* it2 */, size_type /* s1 */, size_type /* s2 */) { }
+ (base_tensor::iterator &/* it */, base_tensor::const_iterator &/* it1 */,
+ base_tensor::const_iterator &/* it2 */, size_type /* s1 */, size_type /* s2 */) { }
template<> inline void reduc_elem_d_unrolled__<11, 0>
- (base_tensor::iterator &/* it */, base_tensor::iterator &/* it1 */,
- base_tensor::iterator &/* it2 */, size_type /* s1 */, size_type /* s2 */) { }
+ (base_tensor::iterator &/* it */, base_tensor::const_iterator &/* it1 */,
+ base_tensor::const_iterator &/* it2 */, size_type /* s1 */, size_type /* s2 */) { }
template<> inline void reduc_elem_d_unrolled__<12, 0>
- (base_tensor::iterator &/* it */, base_tensor::iterator &/* it1 */,
- base_tensor::iterator &/* it2 */, size_type /* s1 */, size_type /* s2 */) { }
+ (base_tensor::iterator &/* it */, base_tensor::const_iterator &/* it1 */,
+ base_tensor::const_iterator &/* it2 */, size_type /* s1 */, size_type /* s2 */) { }
template<> inline void reduc_elem_d_unrolled__<13, 0>
- (base_tensor::iterator &/* it */, base_tensor::iterator &/* it1 */,
- base_tensor::iterator &/* it2 */, size_type /* s1 */, size_type /* s2 */) { }
+ (base_tensor::iterator &/* it */, base_tensor::const_iterator &/* it1 */,
+ base_tensor::const_iterator &/* it2 */, size_type /* s1 */, size_type /* s2 */) { }
template<> inline void reduc_elem_d_unrolled__<14, 0>
- (base_tensor::iterator &/* it */, base_tensor::iterator &/* it1 */,
- base_tensor::iterator &/* it2 */, size_type /* s1 */, size_type /* s2 */) { }
+ (base_tensor::iterator &/* it */, base_tensor::const_iterator &/* it1 */,
+ base_tensor::const_iterator &/* it2 */, size_type /* s1 */, size_type /* s2 */) { }
template<> inline void reduc_elem_d_unrolled__<15, 0>
- (base_tensor::iterator &/* it */, base_tensor::iterator &/* it1 */,
- base_tensor::iterator &/* it2 */, size_type /* s1 */, size_type /* s2 */) { }
+ (base_tensor::iterator &/* it */, base_tensor::const_iterator &/* it1 */,
+ base_tensor::const_iterator &/* it2 */, size_type /* s1 */, size_type /* s2 */) { }
template<> inline void reduc_elem_d_unrolled__<16, 0>
- (base_tensor::iterator &/* it */, base_tensor::iterator &/* it1 */,
- base_tensor::iterator &/* it2 */, size_type /* s1 */, size_type /* s2 */) { }
+ (base_tensor::iterator &/* it */, base_tensor::const_iterator &/* it1 */,
+ base_tensor::const_iterator &/* it2 */, size_type /* s1 */, size_type /* s2 */) { }
// Performs Ani Bmi -> Cmn. Automatically doubly unrolled operation
// (for uniform meshes).
@@ -2952,9 +3043,10 @@ namespace getfem {
GA_DEBUG_ASSERT(s2 == S2, "Internal error");
GA_DEBUG_ASSERT(t.size() == s1*s2, "Internal error, " << t.size()
<< " != " << s1 << "*" << s2);
- base_tensor::iterator it = t.begin(), it1 = tc1.begin();
+ base_tensor::iterator it = t.begin();
+ base_tensor::const_iterator it1 = tc1.cbegin();
for (size_type ii = 0; ii < s1; ++ii, ++it1) {
- base_tensor::iterator it2 = tc2.begin();
+ base_tensor::const_iterator it2 = tc2.cbegin();
reduc_elem_d_unrolled__<N, S2>(it, it1, it2, s1, s2);
}
GA_DEBUG_ASSERT(it == t.end(), "Internal error");
@@ -3141,6 +3233,7 @@ namespace getfem {
}
switch(n) {
+ case 1 : GMM_ASSERT1(false, "n=1 should not happen");
case 2 : return std::make_shared<ga_instruction_contraction_unrolled< 2>>
(t, tc1, tc2);
case 3 : return std::make_shared<ga_instruction_contraction_unrolled< 3>>
@@ -3498,34 +3591,114 @@ namespace getfem {
: t(t_), tc1(tc1_), tc2(tc2_) {}
};
- template<int S1> inline void tmult_elem_unrolled__
- (base_tensor::iterator &it, base_tensor::iterator &it1,
- base_tensor::iterator &it2) {
- *it++ = (*it1++)*(*it2);
- tmult_elem_unrolled__<S1-1>(it, it1, it2);
+ template<int I> inline void dax__(base_tensor::iterator &it,
+ base_tensor::const_iterator &itx,
+ const scalar_type &a) {
+ constexpr int I1 = I/8;
+ constexpr int I2 = I - I1*8;
+ for (int i=0; i < I1; ++i)
+ dax__<8>(it, itx , a);
+ dax__<I2>(it, itx , a);
+ }
+ template<> inline void dax__<8>(base_tensor::iterator &it,
+ base_tensor::const_iterator &itx,
+ const scalar_type &a) {
+ *it++ = *itx++ * a;
+ *it++ = *itx++ * a;
+ *it++ = *itx++ * a;
+ *it++ = *itx++ * a;
+ *it++ = *itx++ * a;
+ *it++ = *itx++ * a;
+ *it++ = *itx++ * a;
+ *it++ = *itx++ * a;
}
- template<> inline void tmult_elem_unrolled__<0>
- (base_tensor::iterator &/*it*/, base_tensor::iterator &/*it1*/,
- base_tensor::iterator &/*it2*/) { }
+ template<> inline void dax__<7>(base_tensor::iterator &it,
+ base_tensor::const_iterator &itx,
+ const scalar_type &a) {
+ *it++ = *itx++ * a;
+ *it++ = *itx++ * a;
+ *it++ = *itx++ * a;
+ *it++ = *itx++ * a;
+ *it++ = *itx++ * a;
+ *it++ = *itx++ * a;
+ *it++ = *itx++ * a;
+ }
+ template<> inline void dax__<6>(base_tensor::iterator &it,
+ base_tensor::const_iterator &itx,
+ const scalar_type &a) {
+ *it++ = *itx++ * a;
+ *it++ = *itx++ * a;
+ *it++ = *itx++ * a;
+ *it++ = *itx++ * a;
+ *it++ = *itx++ * a;
+ *it++ = *itx++ * a;
+ }
+ template<> inline void dax__<5>(base_tensor::iterator &it,
+ base_tensor::const_iterator &itx,
+ const scalar_type &a) {
+ *it++ = *itx++ * a;
+ *it++ = *itx++ * a;
+ *it++ = *itx++ * a;
+ *it++ = *itx++ * a;
+ *it++ = *itx++ * a;
+ }
+ template<> inline void dax__<4>(base_tensor::iterator &it,
+ base_tensor::const_iterator &itx,
+ const scalar_type &a) {
+ *it++ = *itx++ * a;
+ *it++ = *itx++ * a;
+ *it++ = *itx++ * a;
+ *it++ = *itx++ * a;
+ }
+ template<> inline void dax__<3>(base_tensor::iterator &it,
+ base_tensor::const_iterator &itx,
+ const scalar_type &a) {
+ *it++ = *itx++ * a;
+ *it++ = *itx++ * a;
+ *it++ = *itx++ * a;
+ }
+ template<> inline void dax__<2>(base_tensor::iterator &it,
+ base_tensor::const_iterator &itx,
+ const scalar_type &a) {
+ *it++ = *itx++ * a;
+ *it++ = *itx++ * a;
+ }
+ template<> inline void dax__<1>(base_tensor::iterator &it,
+ base_tensor::const_iterator &itx,
+ const scalar_type &a) {
+ *it++ = *itx++ * a;
+ }
+ template<> inline void dax__<0>(base_tensor::iterator &,
+ base_tensor::const_iterator &,
+ const scalar_type &) {}
// Performs Aij Bkl -> Cijkl, partially unrolled version
- template<int S1> struct ga_instruction_simple_tmult_unrolled
+ template<int IJ> struct ga_instruction_simple_tmult_unrolled
: public ga_instruction {
- base_tensor &t, &tc1, &tc2;
+ base_tensor &t;
+ const base_tensor &tc1, &tc2;
virtual int exec() {
- size_type s2 = tc2.size();
- GA_DEBUG_ASSERT(tc1.size() == S1,
- "Wrong sizes " << tc1.size() << " != " << S1);
+ size_type KL = tc2.size();
+ GA_DEBUG_ASSERT(tc1.size() == IJ,
+ "Wrong sizes " << tc1.size() << " != " << IJ);
GA_DEBUG_INFO("Instruction: simple tensor product, unrolled with "
- << S1 << " operations");
- GA_DEBUG_ASSERT(t.size() == S1 * s2,
- "Wrong sizes " << t.size() << " != " << S1 << "*" << s2);
- base_tensor::iterator it = t.begin(), it2 = tc2.begin();
- for (size_type ii = 0; ii < s2; ++ii, ++it2) {
- base_tensor::iterator it1 = tc1.begin();
- tmult_elem_unrolled__<S1>(it, it1, it2);
+ << IJ << " operations");
+ GA_DEBUG_ASSERT(t.size() == IJ * KL,
+ "Wrong sizes " << t.size() << " != " << IJ << "*" << KL);
+#if 0 // too slow, how can this be? that's what dger should be good at. (it is slower even without the std::fill overhead)
+ const BLAS_INT IJ_=BLAS_INT(IJ), KL_=BLAS_INT(KL), INC(1);
+ const scalar_type one(1);
+ std::fill(t.begin(), t.end(), scalar_type(0));
+ gmm::dger_(&IJ_, &KL_, &one, &tc1[0], &INC, &tc2[0], &INC, &(t[0]), &IJ_);
+#else
+ base_tensor::iterator it = t.begin();
+ base_tensor::const_iterator it2 = tc2.cbegin();
+ for (size_type kl = 0; kl < KL; ++kl, ++it2) {
+ base_tensor::const_iterator it1 = tc1.cbegin();
+ dax__<IJ>(it, it1, *it2);
}
GA_DEBUG_ASSERT(it == t.end(), "Internal error");
+#endif
return 0;
}
ga_instruction_simple_tmult_unrolled(base_tensor &t_, base_tensor &tc1_,
@@ -3536,6 +3709,7 @@ namespace getfem {
pga_instruction ga_uniform_instruction_simple_tmult
(base_tensor &t, base_tensor &tc1, base_tensor &tc2) {
switch(tc1.size()) {
+ case 1 : GMM_ASSERT1(false, "size 1 should not happen");
case 2 : return std::make_shared<ga_instruction_simple_tmult_unrolled< 2>>
(t, tc1, tc2);
case 3 : return std::make_shared<ga_instruction_simple_tmult_unrolled< 3>>
@@ -3574,27 +3748,30 @@ namespace getfem {
// Performs Ami Bnj -> Cmnij. To be optimized.
struct ga_instruction_spec_tmult : public ga_instruction {
- base_tensor &t, &tc1, &tc2;
- size_type s1_2, s2_2;
+ base_tensor &t;
+ const base_tensor &tc1, &tc2;
+ const size_type I, J;
virtual int exec() {
GA_DEBUG_INFO("Instruction: specific tensor product");
GA_DEBUG_ASSERT(t.size() == tc1.size() * tc2.size(), "Wrong sizes");
- size_type s1_1 = tc1.size() / s1_2;
- size_type s2_1 = tc2.size() / s2_2;
-
+ const size_type M = tc1.size() / I,
+ N = tc2.size() / J;
base_tensor::iterator it = t.begin();
- for (size_type j = 0; j < s2_2; ++j)
- for (size_type i = 0; i < s1_2; ++i)
- for (size_type n = 0; n < s2_1; ++n)
- for (size_type m = 0; m < s1_1; ++m, ++it)
- *it = tc1[m+i*s1_1] * tc2[n+j*s2_1];
+ for (size_type j = 0; j < J; ++j)
+ for (size_type i = 0; i < I; ++i)
+ for (size_type n = 0; n < N; ++n) {
+ scalar_type val = tc2[n+N*j];
+ for (size_type m = 0; m < M; ++m, ++it)
+ *it = tc1[m+M*i] * val;
+ }
GA_DEBUG_ASSERT(it == t.end(), "Wrong sizes");
return 0;
}
- ga_instruction_spec_tmult(base_tensor &t_, base_tensor &tc1_,
- base_tensor &tc2_, size_type s1_2_,
- size_type s2_2_)
- : t(t_), tc1(tc1_), tc2(tc2_), s1_2(s1_2_), s2_2(s2_2_) {}
+ ga_instruction_spec_tmult(base_tensor &t_,
+ const base_tensor &tc1_,
+ const base_tensor &tc2_,
+ size_type I_, size_type J_)
+ : t(t_), tc1(tc1_), tc2(tc2_), I(I_), J(J_) {}
};
// Performs Ai Bmj -> Cmij. To be optimized.
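The selection between these fixed-size kernels happens once, when the assembly program is built, via switch statements on the runtime size (see the switch(n) and switch(tc1.size()) blocks above). A minimal sketch of that dispatch pattern follows; the names instruction, contraction_unrolled and make_contraction are invented stand-ins, not the getfem classes, and only a few sizes are covered for brevity.

#include <cstddef>
#include <memory>
#include <stdexcept>

struct instruction {
  virtual int exec() = 0;
  virtual ~instruction() = default;
};

// Unrolled kernel for a reduction size I known at compile time.
template<int I> struct contraction_unrolled : instruction {
  const double *a, *b;
  double *c;
  contraction_unrolled(const double *a_, const double *b_, double *c_)
    : a(a_), b(b_), c(c_) {}
  int exec() override {
    double r = a[0]*b[0];
    for (int i = 1; i < I; ++i) r += a[i]*b[i];
    *c = r;
    return 0;
  }
};

// The runtime size is mapped once onto the matching fixed-size kernel,
// so the hot loop later runs without any size-dependent branching.
inline std::unique_ptr<instruction>
make_contraction(const double *a, const double *b, double *c, std::size_t n) {
  switch (n) {
  case 2: return std::make_unique<contraction_unrolled<2>>(a, b, c);
  case 3: return std::make_unique<contraction_unrolled<3>>(a, b, c);
  case 4: return std::make_unique<contraction_unrolled<4>>(a, b, c);
  default: throw std::invalid_argument("size not covered in this sketch");
  }
}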