[Top][All Lists]
[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
[Bug-gnubg] Vectorizing 2nd step
From: |
Øystein Johansen |
Subject: |
[Bug-gnubg] Vectorizing 2nd step |
Date: |
Sat, 16 Apr 2005 18:47:05 +0200 |
User-agent: |
Mozilla Thunderbird 0.8 (Windows/20040913) |
-----BEGIN PGP SIGNED MESSAGE-----
Hash: SHA1
Here's a patch for i386 / GCC vectorizing of the inner loops of
Evaluate(). I see some improvement, but I believe this can be improved
even further.
Some comments:
I believe having an integer counter in the loop slows it down. Can I exit
the loop in another way?
I initialize a vector for scaling in the second loop. I believe this can
be made simpler. Any suggestions?
Please comment on these two issues.
I get an internal compiler error when I compile with a gcc-4.1 snapshot.
This is compiled with gcc-3.4. I will also vectorize the other loops in
the evaluation function. And I will also vectorize the sigmoid function.
- -Øystein
-----BEGIN PGP SIGNATURE-----
Version: GnuPG v1.2.4 (MingW32)
Comment: Using GnuPG with Thunderbird - http://enigmail.mozdev.org
iD8DBQFCYUGJ6kDTFPhwyqYRAv9IAJoCTyBmxfknnxaIyezApxtYkfLP9gCePLdx
kiN6PdG/t/RW54PI20mLPJ0=
=kjeM
-----END PGP SIGNATURE-----
Index: neuralnet.c
===================================================================
RCS file: /cvsroot/gnubg/gnubg/lib/neuralnet.c,v
retrieving revision 1.23
diff -u -r1.23 neuralnet.c
--- neuralnet.c 25 Feb 2005 11:34:24 -0000 1.23
+++ neuralnet.c 16 Apr 2005 16:43:36 -0000
@@ -444,6 +444,21 @@
return 0;
}
+typedef float v4sf __attribute__ ((vector_size(16)));
+
+typedef union _vec4f {
+ v4sf v;
+ float f[4];
+} vec4f;
+
+#if DEBUG
+void
+printvec( v4sf vec ){
+ float *pFloat = ( float *) &vec;
+ printf("%f, %f, %f, %f\n", pFloat[0], pFloat[1], pFloat[2], pFloat[3]);
+}
+#endif
+
static int Evaluate( neuralnet *pnn, float arInput[], float ar[],
float arOutput[], float *saveAr ) {
@@ -452,6 +467,9 @@
#else
int i, j;
float *prWeight;
+ v4sf sum, vec0, vec1, vec3;
+
+ assert(pnn->cHidden == 128);
/* Calculate activity at hidden nodes */
for( i = 0; i < pnn->cHidden; i++ )
@@ -466,11 +484,28 @@
float *pr = ar;
if( ari == 1.0f )
- for( j = pnn->cHidden; j; j-- )
- *pr++ += *prWeight++;
- else
- for( j = pnn->cHidden; j; j-- )
- *pr++ += *prWeight++ * ari;
+ for( j = 32; j; j--, pr += 4, prWeight += 4 ){
+ vec0 = __builtin_ia32_loadups(pr);
+ vec1 = __builtin_ia32_loadups(prWeight);
+ sum = __builtin_ia32_addps(vec0, vec1);
+ __builtin_ia32_storeups (pr, sum);
+ }
+// *pr++ += *prWeight++;
+ else {
+ float scale[4];
+ v4sf scalevector;
+ scale[0] = scale[1] = scale[2] = scale[3] = ari;
+ scalevector = __builtin_ia32_loadups(scale);
+ for( j = 32; j; j--, pr += 4, prWeight += 4 ){
+ vec0 = __builtin_ia32_loadups(pr);
+ vec1 = __builtin_ia32_loadups(prWeight);
+ vec3 = __builtin_ia32_mulps(vec1, scalevector);
+ sum = __builtin_ia32_addps(vec0, vec3);
+ __builtin_ia32_storeups (pr, sum);
+ }
+// for( j = pnn->cHidden; j; j-- )
+// *pr++ += *prWeight++ * ari;
+ }
} else
prWeight += pnn->cHidden;
}
@@ -484,14 +519,22 @@
/* Calculate activity at output nodes */
prWeight = pnn->arOutputWeight;
-
+
for( i = 0; i < pnn->cOutput; i++ ) {
- float r = pnn->arOutputThreshold[ i ];
-
- for( j = 0; j < pnn->cHidden; j++ )
- r += ar[ j ] * *prWeight++;
-
- arOutput[ i ] = sigmoid( -pnn->rBetaOutput * r );
+ float r = pnn->arOutputThreshold[ i ];
+ float *pr = ar;
+ vec4f sum;
+ v4sf vec0, vec1, vec3;
+ sum.v = __builtin_ia32_xorps(sum.v, sum.v);
+ for( j = 32; j ; j--, prWeight += 4, pr += 4 ){
+ vec0 = __builtin_ia32_loadups(pr); /* Four floats into vec0 */
+ vec1 = __builtin_ia32_loadups(prWeight); /* Four weights into vec1 */
+ vec3 = __builtin_ia32_mulps(vec0, vec1); /* Multiply */
+ sum.v = __builtin_ia32_addps(sum.v, vec3); /* Add */
+ }
+
+ r += sum.f[0] + sum.f[1] + sum.f[2] + sum.f[3];
+ arOutput[ i ] = sigmoid( -pnn->rBetaOutput * r );
}
return 0;