[Top][All Lists]
[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
Re: [Devel] Question about optimizing FT_MulFix.
From: |
David Turner |
Subject: |
Re: [Devel] Question about optimizing FT_MulFix. |
Date: |
Mon, 30 Apr 2001 16:00:09 +0200 |
Hi Tom,
Tom Kacvinsky a écrit :
>
> I don't understand the use of adding 0x8000, and the sign changing.
>
This is for rounding, which happens to be asymetrical, i.e.
round(x) == ( x >= 0 ? trunc(x+0.5) : -trunc(-x+0.5) );
Note that the result depends on the sign of "x". Now, if
we want to compute the rounded value of A*2^16/B, where
"/" is the normal division, and "div" the integer division:
A div B == trunc( A/B )
trunc( A/B+0.5 ) == trunc( (A+B/2)/B ) == (A+B/2) div B
we could also write something like the following for
FT_MulFix:
if ( A^B >= 0 ) // A^B has the same sign as A*B or A/B
{
result = (FT_Int32)(((FT_INT64)a*b + 0x8000) >> 16);
}
else
{
result = -(FT_Int32)(((FT_Int64)(-a)*b + 0x8000)/b) >> 16);
}
however, it seems to be slower in some cases (read below)
> I think this is just as accurate:
>
> #ifdef FT_LONG64
>
> FT_EXPORT_DEF( FT_Long ) FT_MulFix( FT_Long a,
> FT_Long b )
> {
>
> return (FT_Long)( ( (FT_Int64)a * b ) >> 16 );
> }
>
> #endif
>
> Moreover, we avoid the two comparisons, the addition of 0x8000, and
> the multiplication by s. This has to be worth *some* optimization.
>
It will not give you the same results however, and I believe that
proper TrueType hinting _needs_ rounding..
On the other hand, I've never though very much about the 64-bit
version of FT_MulFix. I've just wrote a small benchmark to compare
the current implementation with several alternatives. It seems
we can easily gain a 12-15% speedup in this function by getting
rid of the multiplication by 's'..
Here's a (small) table of results:
For a 266 Mobile Pentium:
gcc 2.95.3 (Mandrake 7.2) with -O6 -mpentium mult_fix4 wins
gcc 2.95.2 (Windows) with -O6 -mpentium mult_fix3 wins
visual C++ 6.0 with /G5 /Ox mult_fix3 wins
borland C++ with -O2 -5 mult_fix3 wins
For a 700 Mhz Pentium III
gcc 2.95.3 (Mandrake 7.2) with -O6 -mpentiumpro mult_fix2 wins
(however "mult_fix3" is slightly behind it)
"mult_fix2" corresponds to the original code in FT_MulFix, and it's
interesting to see that it's the fastest on a "modern" processor
like a PIII. I'd like to see the results for a PowerPC or an Alpha
though, just to get sure of what's hapenning..
Apparently, it seems that "mult_fix3" is a winner, because it
povides a 12-15% speedup on "old" processors, whild loses less
than 1% of performance compared to the original FT_MulFix on
a Pentium III..
I'd like to have more data however to be certain about that..
Testers are then welcome :-)
Regards,
- David
here's the source code for the benchmark. Note that you'll need
the FreeType 2 headers (not the library), and that this will
probably not compile if you don't have FT_CONFIG_OPTION_FORCE_INT64
defined in your <freetype/config/ftoption.h>
----------------- fixmul64.c -----------------------------------
#include <ft2build.h>
#include FT_FREETYPE_H
#ifndef FT_INT64
#define FT_INT64 long long
#endif
/* version 1 is a simple direct implementation */
extern FT_Int32
mult_fix1( FT_Int32 a, FT_Int32 b )
{
FT_Int32 result;
FT_INT64 val;
if ( a^b >= 0 ) /* same sign as A/B, easy to compute */
{
val = (FT_INT64)a * b;
result = (FT_Int32)((val + 0x8000) >> 16);
}
else
{
val = - (FT_INT64)a * b;
result = - (FT_Int32)((val + 0x8000) >> 16);
}
return result;
}
/* version 2 if the original code used in FT_MulFix */
/* on modern processors, multiplying by a value that is 1 */
/* or -1 is extremely fast.. */
extern FT_Int32
mult_fix2( FT_Int32 a, FT_Int32 b )
{
FT_Int32 s = 1;
FT_Int32 result;
if ( a < 0 ) { a = -a; s = -1; };
if ( b < 0 ) { b = -b; s = -s; };
result = ((FT_INT64)a * b + 0x8000) >> 16;
return s*result;
}
/* version 3 is a variation of version 2 where we do not */
/* use a multiplication to compute the end result.. */
/* that seems to be faster in less "modern" processors, */
/* like a Pentium.. */
extern FT_Int32
mult_fix3( FT_Int32 a, FT_Int32 b )
{
FT_Int32 s = 1;
FT_Int32 result;
if ( a < 0 ) { a = -a; s = -1; };
if ( b < 0 ) { b = -b; s = -s; };
result = ((FT_INT64)a * b + 0x8000) >> 16;
return (s < 0) ? -result : result;
}
/* version 4 is slighty different to version 3, in the way */
/* "s" is computed.. */
/* a good optimizing compiler on a modern processor should */
/* be able to use conditional moves to implement the ? : */
/* */
extern FT_Int32
mult_fix4( FT_Int32 a, FT_Int32 b )
{
FT_Int32 s;
FT_Int32 result;
s = a^b;
a = (a < 0) ? -a : a;
b = (b < 0) ? -b : b;
result = ((FT_INT64)a * b + 0x8000) >> 16;
return (s < 0) ? -result : result;
}
/* version 5 is a variation of version 4 that uses the fact */
/* that abs(x) = ( x^(x >> 31) ) - (x >> 31).. this avoids */
/* the use of jumps when implementing ? : */
/* */
extern FT_Int32
mult_fix5( FT_Int32 a, FT_Int32 b )
{
FT_Int32 s, c, d;
FT_Int32 result;
s = a^b;
c = (a >> 31); a = (a^c) - c;
d = (b >> 31); b = (b^d) - d;
result = ((FT_INT64)a * b + 0x8000) >> 16;
c = s >> 31;
result = (result^c) - c;
return result;
}
---------------------- end of fixmul64.c -------------------------------
-------------------------- test_fixmul.c --------------------------------
#include <ft2build.h>
#include FT_FREETYPE_H
#include <time.h> /* for clock() */
/* SunOS 4.1.* does not define CLOCKS_PER_SEC, so include <sys/param.h>
*/
/* to get the HZ macro which is the equivalent.
*/
#if defined(__sun__) && !defined(SVR4) && !defined(__SVR4)
#include <sys/param.h>
#define CLOCKS_PER_SEC HZ
#endif
static long
get_time( void )
{
return clock() * 10000L / CLOCKS_PER_SEC;
}
extern FT_Int32
mult_fix1( FT_Int32 a, FT_Int32 b );
extern FT_Int32
mult_fix2( FT_Int32 a, FT_Int32 b );
extern FT_Int32
mult_fix3( FT_Int32 a, FT_Int32 b );
extern FT_Int32
mult_fix4( FT_Int32 a, FT_Int32 b );
extern FT_Int32
mult_fix5( FT_Int32 a, FT_Int32 b );
#define REPEAT 80000L
#define SIZE 1000
static void
test( void )
{
static FT_Int32 vals1[SIZE];
static FT_Int32 vals2[SIZE];
static FT_Int32 vals3[SIZE];
static FT_Int32 vals4[SIZE];
int i, j;
double t0;
/* initialise array */
for ( i = 0; i < SIZE; i++ )
vals1[i] = (FT_Int32)(65536.0*rand() - 32768.0);
for ( i = 0; i < SIZE; i++ )
vals2[i] = (FT_Int32)(65536.0*rand() - 32768.0);
t0 = get_time();
for ( j = 0; j < REPEAT; j++ )
for ( i = 0; i < SIZE; i++ )
vals3[i] = mult_fix1( vals1[i], vals2[i] );
t0 = get_time() - t0;
printf( "mult_fix1 : time = %.5f\n", t0/10000.0 );
t0 = get_time();
for ( j = 0; j < REPEAT; j++ )
for ( i = 0; i < SIZE; i++ )
vals4[i] = mult_fix2( vals1[i], vals2[i] );
t0 = get_time() - t0;
printf( "mult_fix2 : time = %.5f\n", t0/10000.0 );
t0 = get_time();
for ( j = 0; j < REPEAT; j++ )
for ( i = 0; i < SIZE; i++ )
vals4[i] = mult_fix3( vals1[i], vals2[i] );
t0 = get_time() - t0;
printf( "mult_fix3 : time = %.5f\n", t0/10000.0 );
t0 = get_time();
for ( j = 0; j < REPEAT; j++ )
for ( i = 0; i < SIZE; i++ )
vals4[i] = mult_fix4( vals1[i], vals2[i] );
t0 = get_time() - t0;
printf( "mult_fix4 : time = %.5f\n", t0/10000.0 );
t0 = get_time();
for ( j = 0; j < REPEAT; j++ )
for ( i = 0; i < SIZE; i++ )
vals4[i] = mult_fix5( vals1[i], vals2[i] );
t0 = get_time() - t0;
printf( "mult_fix5 : time = %.5f\n", t0/10000.0 );
/* check */
for ( i = 0; i < SIZE; i++ )
if ( vals3[i] != vals4[i] )
{
printf( "invalid match : A = %8x (%.4f), B = %8x (%.4f)\n",
vals1[i], vals1[i]/65536.0, vals2[i], vals2[i]/65536.0
);
printf( " mult_fix1(A,B) = %8x (%.4f)\n", vals3[i],
vals3[i]/65536.0 );
printf( " mult_fix2(A,B) = %8x (%.4f)\n", vals4[i],
vals4[i]/65536.0 );
}
}
int main( int argc, char** argv )
{
test();
return 0;
}
------------------------------ end of fixmul.c ------------------------------