[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
[Getfem-commits] r5432 - in /trunk/getfem: ./ contrib/opt_assembly/ src/
From: |
Yves . Renard |
Subject: |
[Getfem-commits] r5432 - in /trunk/getfem: ./ contrib/opt_assembly/ src/ src/gmm/ |
Date: |
Fri, 21 Oct 2016 14:20:06 -0000 |
Author: renard
Date: Fri Oct 21 16:20:04 2016
New Revision: 5432
URL: http://svn.gna.org/viewcvs/getfem?rev=5432&view=rev
Log:
disabling blas interface by default (option --enable-blas-interface) and small
optimizations
Modified:
trunk/getfem/configure.ac
trunk/getfem/contrib/opt_assembly/opt_assembly.cc
trunk/getfem/src/bgeot_geometric_trans.cc
trunk/getfem/src/getfem_generic_assembly.cc
trunk/getfem/src/gmm/gmm_blas_interface.h
Modified: trunk/getfem/configure.ac
URL:
http://svn.gna.org/viewcvs/getfem/trunk/getfem/configure.ac?rev=5432&r1=5431&r2=5432&view=diff
==============================================================================
--- trunk/getfem/configure.ac (original)
+++ trunk/getfem/configure.ac Fri Oct 21 16:20:04 2016
@@ -309,6 +309,19 @@
LIBS="$LIBS $BLAS_LIBS"
CPPFLAGS="$CPPFLAGS -DGMM_USES_BLAS"
+useblasinterface=NO
+AC_ARG_ENABLE(blas_interface,
+ [AS_HELP_STRING([--enable-blas-interface],[enable the use of the blas call
for basic algebra routines.])],
+ [case $enableval in
+ yes | "") useblasinterface=YES ;;
+ no) useblasinterface=NO ;;
+ esac],
+ [useblasinterface=NO]
+)
+
+if test x$useblasinterface = xYES; then
+ CPPFLAGS="$CPPFLAGS -DGMM_USES_BLAS_INTERFACE"
+fi
dnl ---------------------------OPENMP------------------------------
useopenmp=0
Modified: trunk/getfem/contrib/opt_assembly/opt_assembly.cc
URL:
http://svn.gna.org/viewcvs/getfem/trunk/getfem/contrib/opt_assembly/opt_assembly.cc?rev=5432&r1=5431&r2=5432&view=diff
==============================================================================
--- trunk/getfem/contrib/opt_assembly/opt_assembly.cc (original)
+++ trunk/getfem/contrib/opt_assembly/opt_assembly.cc Fri Oct 21 16:20:04 2016
@@ -250,7 +250,7 @@
bool all = false;
bool select = true;
- int only_one = 4;
+ int only_one = 5;
if (all || only_one == 1) {
VEC_TEST_1("Test for source term", ndofu, "u.Test_u", mim, size_type(-1),
@@ -438,40 +438,40 @@
// - Instructions execution except for assembly ones
// new | old | sto | asse | exec | Ins |
test_new_assembly(2, 400, 1); // ndofu = 321602 ndofp = 160801 ndofchi = 1201
- // Mass (scalar) : 0.18 | 0.61 | 0.04 | 0.06 | 0.06 | 0.06 |
+ // Mass (scalar) : 0.18 | 0.59 | 0.04 | 0.06 | 0.06 | 0.06 |
// Mass (vector) : 0.30 | 0.82 | 0.09 | 0.15 | 0.06 | 0.09 |
- // Laplacian : 0.16 | 0.83 | 0.04 | 0.05 | 0.06 | 0.05 |
+ // Laplacian : 0.16 | 0.80 | 0.04 | 0.05 | 0.06 | 0.05 |
// Homogeneous elas : 0.31 | 1.88 | 0.08 | 0.13 | 0.06 | 0.10 |
// Non-homogeneous elast: 0.37 | 2.26 | 0.09 | 0.16 | 0.06 | 0.15 |
test_new_assembly(3, 36, 1); // ndofu = 151959 ndofp = 50653 ndofchi = 6553
- // Mass (scalar) : 0.28 | 0.77 | 0.05 | 0.09 | 0.13 | 0.06 |
- // Mass (vector) : 0.76 | 1.54 | 0.17 | 0.29 | 0.13 | 0.34 |
- // Laplacian : 0.32 | 1.38 | 0.03 | 0.06 | 0.13 | 0.13 |
- // Homogeneous elas : 0.94 | 4.58 | 0.26 | 0.33 | 0.14 | 0.47 |
- // Non-homogeneous elast: 1.01 | 6.72 | 0.26 | 0.33 | 0.14 | 0.54 |
+ // Mass (scalar) : 0.27 | 0.75 | 0.05 | 0.08 | 0.13 | 0.06 |
+ // Mass (vector) : 0.74 | 1.54 | 0.17 | 0.27 | 0.13 | 0.34 |
+ // Laplacian : 0.29 | 1.37 | 0.03 | 0.06 | 0.13 | 0.10 |
+ // Homogeneous elas : 0.91 | 4.58 | 0.26 | 0.33 | 0.13 | 0.45 |
+ // Non-homogeneous elast: 0.98 | 6.72 | 0.26 | 0.33 | 0.13 | 0.52 |
test_new_assembly(2, 200, 2); // ndofu = 321602 ndofp = 160801 ndofchi = 1201
// Mass (scalar) : 0.09 | 0.25 | 0.02 | 0.03 | 0.03 | 0.03 |
// Mass (vector) : 0.26 | 0.44 | 0.05 | 0.09 | 0.03 | 0.14 |
// Laplacian : 0.09 | 0.37 | 0.02 | 0.03 | 0.03 | 0.03 |
// Homogeneous elas : 0.26 | 1.28 | 0.06 | 0.09 | 0.03 | 0.14 |
- // Non-homogeneous elast: 0.31 | 2.40 | 0.07 | 0.10 | 0.03 | 0.18 |
+ // Non-homogeneous elast: 0.31 | 2.38 | 0.07 | 0.10 | 0.03 | 0.18 |
test_new_assembly(3, 18, 2); // ndofu = 151959 ndofp = 50653 ndofchi = 6553
- // Mass (scalar) : 0.13 | 0.29 | 0.05 | 0.07 | 0.03 | 0.03 |
- // Mass (vector) : 1.18 | 0.90 | 0.21 | 0.37 | 0.03 | 0.78 |
+ // Mass (scalar) : 0.13 | 0.28 | 0.05 | 0.07 | 0.03 | 0.03 |
+ // Mass (vector) : 1.15 | 0.90 | 0.21 | 0.35 | 0.03 | 0.77 |
// Laplacian : 0.11 | 0.55 | 0.03 | 0.05 | 0.03 | 0.05 |
- // Homogeneous elas : 1.70 | 3.47 | 0.59 | 0.73 | 0.03 | 0.94 |
- // Non-homogeneous elast: 1.77 | 9.25 | 0.59 | 0.73 | 0.03 | 1.01 |
+ // Homogeneous elas : 1.69 | 3.41 | 0.59 | 0.73 | 0.03 | 0.93 |
+ // Non-homogeneous elast: 1.76 | 9.24 | 0.59 | 0.73 | 0.03 | 1.00 |
test_new_assembly(3, 9, 4); // ndofu = 151959 ndofp = 50653 ndofchi = 6553
// Mass (scalar) : 0.52 | 0.34 | 0.09 | 0.16 | 0.01 | 0.34 |
- // Mass (vector) : 4.46 | 1.32 | 0.41 | 1.30 | 0.01 | 3.15 |
- // Laplacian : 0.41 | 0.77 | 0.09 | 0.15 | 0.01 | 0.25 |
- // Homogeneous elas : 8.99 | 5.26 | 0.88 | 1.43 | 0.01 | 7.55 |
- // Non-homogeneous elast: 9.14 | 48.0 | 0.76 | 1.41 | 0.01 | 7.72 |
+ // Mass (vector) : 4.38 | 1.31 | 0.41 | 1.27 | 0.01 | 3.10 |
+ // Laplacian : 0.40 | 0.77 | 0.09 | 0.14 | 0.01 | 0.25 |
+ // Homogeneous elas : 8.97 | 5.25 | 0.88 | 1.43 | 0.01 | 7.53 |
+ // Non-homogeneous elast: 9.01 | 47.7 | 0.76 | 1.35 | 0.01 | 7.65 |
// Conclusions :
// - Desactivation of debug test has no sensible effect.
// - Compile time of assembly strings is negligible (< 0.0004)
- // - J computation takes half the computational time of the exec part
+ // - (J, K, B) computation takes half the computational time of the exec part
// - The optimized instruction call is negligible
// - For uniform mesh_fem, the resize operations has been suppressed and
// the "update pfp" has been isolated in a set of instruction being
@@ -484,11 +484,10 @@
// - Optimization of J computation (especially in 2D and standard cases)
// - Unroll loop in used instructions
// - Assembly optimization
- // - Detection of the very simples cases where the elementary atrix do not
+ // - Detection of the very simples cases where the elementary matrix do not
// have to be computed on each element (mass matrix, laplacian ...)
// on uniform mesh_fem and mesh_im ?
// - storage optimization (matrices ...)
- // - Fem interpolation context optimization.
// - Why such a difference between mass matrix and laplacian for 3D and P2 ?
// Original table :
Modified: trunk/getfem/src/bgeot_geometric_trans.cc
URL:
http://svn.gna.org/viewcvs/getfem/trunk/getfem/src/bgeot_geometric_trans.cc?rev=5432&r1=5431&r2=5432&view=diff
==============================================================================
--- trunk/getfem/src/bgeot_geometric_trans.cc (original)
+++ trunk/getfem/src/bgeot_geometric_trans.cc Fri Oct 21 16:20:04 2016
@@ -35,16 +35,16 @@
size_type N=gmm::mat_nrows(G), P=gmm::mat_nrows(pc), Q=gmm::mat_ncols(pc);
if (N && P && Q) {
auto itK = K.begin();
-
- for (size_type j = 0; j < Q; ++j)
- for (size_type i = 0; i < N; ++i) {
- auto itG = G.begin() + i;
- auto itpc = pc.begin() + j*P;
- register scalar_type a = *(itG) * (*itpc++); itG += N;
- for (size_type k = 1; k < P; ++k, itG += N)
- a += *(itG) * (*itpc++);
+ for (size_type j = 0; j < Q; ++j) {
+ auto itpc_j = pc.begin() + j*P, itG_b = G.begin();
+ for (size_type i = 0; i < N; ++i, ++itG_b) {
+ auto itG = itG_b, itpc = itpc_j;
+ register scalar_type a = *(itG) * (*itpc);
+ for (size_type k = 1; k < P; ++k)
+ { itG += N; a += *(itG) * (*++itpc); }
*itK++ = a;
}
+ }
} else gmm::clear(K);
}
Modified: trunk/getfem/src/getfem_generic_assembly.cc
URL:
http://svn.gna.org/viewcvs/getfem/trunk/getfem/src/getfem_generic_assembly.cc?rev=5432&r1=5431&r2=5432&view=diff
==============================================================================
--- trunk/getfem/src/getfem_generic_assembly.cc (original)
+++ trunk/getfem/src/getfem_generic_assembly.cc Fri Oct 21 16:20:04 2016
@@ -3289,25 +3289,32 @@
size_type target_dim = Z.sizes()[1];
size_type Qmult = qdim / target_dim;
if (Qmult == 1) {
- gmm::copy(Z.as_vector(), t.as_vector());
+ auto itt = Z.begin(); auto it = t.begin(), ite = t.end();
+ size_type nd = ((t.size()) >> 2);
+ for (size_type i = 0; i < nd; ++i) {
+ *it++ = (*itt++); *it++ = (*itt++);
+ *it++ = (*itt++); *it++ = (*itt++);
+ }
+ for (; it != ite;) *it++ = (*itt++);
+ // gmm::copy(Z.as_vector(), t.as_vector());
} else {
size_type ndof = Z.sizes()[0];
GA_DEBUG_ASSERT(t.size() == Z.size() * Qmult * Qmult,
"Wrong size for base vector");
gmm::clear(t.as_vector());
- base_tensor::const_iterator itZ = Z.begin();
- size_type s = t.sizes()[0], ss = s * Qmult, sss = s+1;
-
- // Performs t(i*Qmult+j, k*Qmult + j) = Z(i,k);
- for (size_type k = 0; k < target_dim; ++k) {
- base_tensor::iterator it = t.begin() + (ss * k);
- for (size_type i = 0; i < ndof; ++i, ++itZ) {
- if (i) it += Qmult;
- base_tensor::iterator it2 = it;
- *it2 = *itZ;
- for (size_type j = 1; j < Qmult; ++j) { it2 += sss; *it2 = *itZ; }
- }
- }
+ auto itZ = Z.begin();
+ size_type s = t.sizes()[0], ss = s * Qmult, sss = s+1;
+
+ // Performs t(i*Qmult+j, k*Qmult + j) = Z(i,k);
+ for (size_type k = 0; k < target_dim; ++k) {
+ auto it = t.begin() + (ss * k);
+
+ for (size_type i = 0; i < ndof; ++i, ++itZ, it += Qmult) {
+ auto it2 = it;
+ *it2 = *itZ;
+ for (size_type j = 1; j < Qmult; ++j) { it2 += sss; *it2 = *itZ; }
+ }
+ }
}
return 0;
}
@@ -3323,7 +3330,14 @@
size_type target_dim = Z.sizes()[1];
size_type Qmult = qdim / target_dim;
if (Qmult == 1) {
- gmm::copy(Z.as_vector(), t.as_vector());
+ auto itt = Z.begin(); auto it = t.begin(), ite = t.end();
+ size_type nd = ((t.size()) >> 2);
+ for (size_type i = 0; i < nd; ++i) {
+ *it++ = (*itt++); *it++ = (*itt++);
+ *it++ = (*itt++); *it++ = (*itt++);
+ }
+ for (; it != ite;) *it++ = (*itt++);
+ // gmm::copy(Z.as_vector(), t.as_vector());
} else {
size_type ndof = Z.sizes()[0];
size_type N = Z.sizes()[2];
@@ -3332,14 +3346,13 @@
gmm::clear(t.as_vector());
base_tensor::const_iterator itZ = Z.begin();
size_type s = t.sizes()[0], ss = s * Qmult, sss = s+1;
- size_type ssss=ss*target_dim;
+ size_type ssss = ss*target_dim;
// Performs t(i*Qmult+j, k*Qmult + j, l) = Z(i,k,l);
for (size_type l = 0; l < N; ++l)
for (size_type k = 0; k < target_dim; ++k) {
base_tensor::iterator it = t.begin() + (ss * k + ssss*l);
- for (size_type i = 0; i < ndof; ++i, ++itZ) {
- if (i) it += Qmult;
+ for (size_type i = 0; i < ndof; ++i, ++itZ, it += Qmult) {
base_tensor::iterator it2 = it;
*it2 = *itZ;
for (size_type j = 1; j < Qmult; ++j) { it2 += sss; *it2 = *itZ;
}
@@ -4046,7 +4059,15 @@
const base_tensor &tc1;
virtual int exec() {
GA_DEBUG_INFO("Instruction: tensor copy");
- gmm::copy(tc1.as_vector(), t.as_vector());
+
+ auto itt = tc1.begin(); auto it = t.begin(), ite = t.end();
+ size_type nd = ((t.size()) >> 2);
+ for (size_type i = 0; i < nd; ++i) {
+ *it++ = (*itt++); *it++ = (*itt++);
+ *it++ = (*itt++); *it++ = (*itt++);
+ }
+ for (; it != ite;) *it++ = (*itt++);
+ // gmm::copy(tc1.as_vector(), t.as_vector());
return 0;
}
ga_instruction_copy_tensor(base_tensor &t_, const base_tensor &tc1_)
@@ -5414,9 +5435,23 @@
GA_DEBUG_INFO("Instruction: vector term assembly for fem variable");
if (ipt == 0 || interpolate) {
elem.resize(t.size());
- gmm::copy(gmm::scaled(t.as_vector(), coeff), elem);
+ auto itt = t.begin(); auto it = elem.begin(), ite = elem.end();
+ size_type nd = ((t.size()) >> 2);
+ for (size_type i = 0; i < nd; ++i) {
+ *it++ = (*itt++) * coeff; *it++ = (*itt++) * coeff;
+ *it++ = (*itt++) * coeff; *it++ = (*itt++) * coeff;
+ }
+ for (; it != ite;) *it++ = (*itt++) * coeff;
+ // gmm::copy(gmm::scaled(t.as_vector(), coeff), elem);
} else {
- gmm::add(gmm::scaled(t.as_vector(), coeff), elem);
+ auto itt = t.begin(); auto it = elem.begin(), ite = elem.end();
+ size_type nd = ((t.size()) >> 2);
+ for (size_type i = 0; i < nd; ++i) {
+ *it++ += (*itt++) * coeff; *it++ += (*itt++) * coeff;
+ *it++ += (*itt++) * coeff; *it++ += (*itt++) * coeff;
+ }
+ for (; it != ite;) *it++ += (*itt++) * coeff;
+ // gmm::add(gmm::scaled(t.as_vector(), coeff), elem);
}
if (ipt == nbpt-1 || interpolate) {
const mesh_fem &mf = *(mfg ? *mfg : mfn);
@@ -5597,7 +5632,7 @@
elem.resize(t.size());
auto itt = t.begin(); auto it = elem.begin(), ite = elem.end();
scalar_type e = coeff*alpha1*alpha2;
- size_type nd = t.size() / 4;
+ size_type nd = ((t.size()) >> 2);
for (size_type i = 0; i < nd; ++i) {
*it++ = (*itt++) * e; *it++ = (*itt++) * e;
*it++ = (*itt++) * e; *it++ = (*itt++) * e;
@@ -5608,7 +5643,7 @@
// Faster than a daxpy blas call on my config
auto itt = t.begin(); auto it = elem.begin(), ite = elem.end();
scalar_type e = coeff*alpha1*alpha2;
- size_type nd = t.size() / 4;
+ size_type nd = ((t.size()) >> 2);
for (size_type i = 0; i < nd; ++i) {
*it++ += (*itt++) * e; *it++ += (*itt++) * e;
*it++ += (*itt++) * e; *it++ += (*itt++) * e;
@@ -5724,7 +5759,7 @@
elem.resize(t.size());
auto itt = t.begin(); auto it = elem.begin(), ite = elem.end();
scalar_type e = coeff*alpha1*alpha2;
- size_type nd = t.size() / 4;
+ size_type nd = ((t.size()) >> 2);
for (size_type i = 0; i < nd; ++i) {
*it++ = (*itt++) * e; *it++ = (*itt++) * e;
*it++ = (*itt++) * e; *it++ = (*itt++) * e;
@@ -5735,7 +5770,7 @@
// Faster than a daxpy blas call on my config
auto itt = t.begin(); auto it = elem.begin(), ite = elem.end();
scalar_type e = coeff*alpha1*alpha2;
- size_type nd = t.size() / 4;
+ size_type nd = ((t.size()) >> 2);
for (size_type i = 0; i < nd; ++i) {
*it++ += (*itt++) * e; *it++ += (*itt++) * e;
*it++ += (*itt++) * e; *it++ += (*itt++) * e;
@@ -5811,8 +5846,10 @@
elem.resize(t.size());
auto itt = t.begin(); auto it = elem.begin(), ite = elem.end();
scalar_type e = coeff*alpha1*alpha2;
- size_type nd = t.size() / 4;
+ size_type nd = ((t.size()) >> 3);
for (size_type i = 0; i < nd; ++i) {
+ *it++ = (*itt++) * e; *it++ = (*itt++) * e;
+ *it++ = (*itt++) * e; *it++ = (*itt++) * e;
*it++ = (*itt++) * e; *it++ = (*itt++) * e;
*it++ = (*itt++) * e; *it++ = (*itt++) * e;
}
@@ -5822,8 +5859,10 @@
// (Far) faster than a daxpy blas call on my config.
auto itt = t.begin(); auto it = elem.begin(), ite = elem.end();
scalar_type e = coeff*alpha1*alpha2;
- size_type nd = t.size() / 4;
+ size_type nd = ((t.size()) >> 3);
for (size_type i = 0; i < nd; ++i) {
+ *it++ += (*itt++) * e; *it++ += (*itt++) * e;
+ *it++ += (*itt++) * e; *it++ += (*itt++) * e;
*it++ += (*itt++) * e; *it++ += (*itt++) * e;
*it++ += (*itt++) * e; *it++ += (*itt++) * e;
}
Modified: trunk/getfem/src/gmm/gmm_blas_interface.h
URL:
http://svn.gna.org/viewcvs/getfem/trunk/getfem/src/gmm/gmm_blas_interface.h?rev=5432&r1=5431&r2=5432&view=diff
==============================================================================
--- trunk/getfem/src/gmm/gmm_blas_interface.h (original)
+++ trunk/getfem/src/gmm/gmm_blas_interface.h Fri Oct 21 16:20:04 2016
@@ -46,6 +46,10 @@
#include "gmm_matrix.h"
namespace gmm {
+
+ // Due to poor performance of level 1 and 2 blas on tested configurations,
+ // this interface is deactivated by default.
+ // Use ./configure --enable-blas-interface to activate it.
#define GMMLAPACK_TRACE(f)
// #define GMMLAPACK_TRACE(f) cout << "function " << f << " called" << endl;
@@ -168,6 +172,8 @@
void sger_(...); void dger_(...); void cgerc_(...); void zgerc_(...);
}
+#if defined(GMM_USES_BLAS_INTERFACE)
+
/* ********************************************************************* */
/* vect_norm2(x). */
/* ********************************************************************* */
@@ -936,6 +942,7 @@
trsv_interface(upper_tri_solve, trsv_lower, gem_p1_c, gem_trans1_c,
ztrsv_, BLAS_Z)
+#endif
}
#endif // GMM_BLAS_INTERFACE_H
[Prev in Thread] |
Current Thread |
[Next in Thread] |
- [Getfem-commits] r5432 - in /trunk/getfem: ./ contrib/opt_assembly/ src/ src/gmm/,
Yves . Renard <=