@@ -985,6 +985,206 @@ typedef ptrdiff_t FT_PtrDist;
 
 #endif
 
+/* Benchmarking shows that using DDA to flatten the quadratic bezier
+ * arcs is slightly faster in the following cases:
+ *
+ *   - When the host CPU is 64-bit.
+ *   - When SSE2 SIMD registers and instructions are available (even on x86).
+ *
+ * For other cases, using binary splits is actually slightly faster.
+ */
+#if defined(__SSE2__) || defined(__x86_64__) || defined(__aarch64__) || defined(_M_AMD64) || defined(_M_ARM64)
+#define BEZIER_USE_DDA 1
+#else
+#define BEZIER_USE_DDA 0
+#endif
+
+#if BEZIER_USE_DDA
+
+/* <emmintrin.h> is only available where SSE2 exists; other 64-bit */
+/* targets (e.g. AArch64) use the plain C DDA path below.          */
+#ifdef __SSE2__
+#include <emmintrin.h>
+#endif
+
+  static void
+  gray_render_conic( RAS_ARG_ const FT_Vector*  control,
+                              const FT_Vector*  to )
+  {
+    FT_Vector  p0, p1, p2;
+
+    p0.x = ras.x;
+    p0.y = ras.y;
+    p1.x = UPSCALE( control->x );
+    p1.y = UPSCALE( control->y );
+    p2.x = UPSCALE( to->x );
+    p2.y = UPSCALE( to->y );
+
+    /* short-cut the arc if it lies entirely outside the current band */
+    if ( ( TRUNC( p0.y ) >= ras.max_ey &&
+           TRUNC( p1.y ) >= ras.max_ey &&
+           TRUNC( p2.y ) >= ras.max_ey ) ||
+         ( TRUNC( p0.y ) < ras.min_ey &&
+           TRUNC( p1.y ) < ras.min_ey &&
+           TRUNC( p2.y ) < ras.min_ey ) )
+    {
+      ras.x = p2.x;
+      ras.y = p2.y;
+      return;
+    }
+
+    TPos  dx = FT_ABS( p0.x + p2.x - 2 * p1.x );
+    TPos  dy = FT_ABS( p0.y + p2.y - 2 * p1.y );
+    if ( dx < dy )
+      dx = dy;
+
+    if ( dx <= ONE_PIXEL / 4 )
+    {
+      gray_render_line( RAS_VAR_ p2.x, p2.y );
+      return;
+    }
+
+    /* We can calculate the number of necessary bisections because  */
+    /* each bisection predictably reduces deviation exactly 4-fold. */
+    /* Even 32-bit deviation would vanish after 16 bisections.      */
+    int  shift = 0;
+    do
+    {
+      dx    >>= 2;
+      shift  += 1;
+    }
+    while ( dx > ONE_PIXEL / 4 );
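+
+    /* The DDA below emits 2^shift line segments, i.e. as many as */
+    /* `shift` recursive bisections would have produced.          */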
+
+    /*
+     * The (P0,P1,P2) arc equation, for t in [0,1] range:
+     *
+     *   P(t) = P0*(1-t)^2 + P1*2*t*(1-t) + P2*t^2
+     *
+     *   P(t) = P0 + 2*(P1-P0)*t + (P0+P2-2*P1)*t^2
+     *        = P0 + 2*B*t + A*t^2
+     *
+     * for A = P0 + P2 - 2*P1
+     * and B = P1 - P0
+     *
+     * Let's consider the difference when advancing by a small
+     * parameter h:
+     *
+     *   Q(h,t) = P(t+h) - P(t) = 2*B*h + A*h^2 + 2*A*h*t
+     *
+     * And then its own difference:
+     *
+     *   R(h,t) = Q(h,t+h) - Q(h,t) = 2*A*h*h = R (constant)
+     *
+     * Since R is always a constant, it is possible to compute
+     * successive positions with:
+     *
+     *   P = P0
+     *   Q = Q(h,0) = 2*B*h + A*h*h
+     *   R = 2*A*h*h
+     *
+     * loop:
+     *   P += Q
+     *   Q += R
+     *   EMIT(P)
+     *
+     * To ensure accurate results, perform computations on 64-bit
+     * values, after scaling them by 2^32.  With h = 1/2^N, where
+     * N is the `shift` value computed above, this gives:
+     *
+     *   R << 32 = 2 * A << (32 - N - N)
+     *           = A << (33 - 2 * N)
+     *
+     *   Q << 32 = (2 * B << (32 - N)) + (A << (32 - N - N))
+     *           = (B << (33 - N)) + (A << (32 - N - N))
+     */
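+
+    /* A handy sanity check: the iteration sums telescope, so after  */
+    /* all 2^N steps P accumulates exactly P0 + 2*B + A = P2, i.e.   */
+    /* the DDA always finishes precisely on the arc's end point.     */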
+#ifdef __SSE2__
+    /* Experience shows that for small shift values, SSE2 is actually slower. */
+    if (shift > 2) {
+      union {
+        struct { FT_Int64  ax, ay, bx, by; } i;
+        struct { __m128i   a, b; }           vec;
+      } u;
+
+      u.i.ax = p0.x + p2.x - 2 * p1.x;
+      u.i.ay = p0.y + p2.y - 2 * p1.y;
+      u.i.bx = p1.x - p0.x;
+      u.i.by = p1.y - p0.y;
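+      /* each __m128i overlays two 64-bit lanes: a = (ax, ay), b = (bx, by) */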
+
+      __m128i a = _mm_load_si128(&u.vec.a);
+      __m128i b = _mm_load_si128(&u.vec.b);
+
+      __m128i r  = _mm_slli_epi64(a, 33 - 2 * shift);
+      __m128i q  = _mm_slli_epi64(b, 33 - shift);
+      __m128i q2 = _mm_slli_epi64(a, 32 - 2 * shift);
+      q = _mm_add_epi64(q2, q);
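+      /* r and q now hold R << 32 and Q << 32 in per-axis 64-bit lanes */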
+
+      union {
+        struct { FT_Int32  px_lo, px_hi, py_lo, py_hi; } i;
+        __m128i  vec;
+      } v;
+      v.i.px_lo = 0;
+      v.i.px_hi = p0.x;
+      v.i.py_lo = 0;
+      v.i.py_hi = p0.y;
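+      /* writing p0 into the high 32-bit halves implements the << 32 */
+      /* scaling; this lo/hi layout assumes little-endian order,     */
+      /* which always holds where SSE2 is available                  */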
+
+      __m128i p = _mm_load_si128(&v.vec);
+
+      for (unsigned count = (1u << shift); count > 0; count--) {
+        p = _mm_add_epi64(p, q);
+        q = _mm_add_epi64(q, r);
+
+        _mm_store_si128(&v.vec, p);
+
+        gray_render_line( RAS_VAR_ v.i.px_hi, v.i.py_hi );
+      }
+      return;
+    }
+#endif /* __SSE2__ */
+    FT_Int64  ax = p0.x + p2.x - 2 * p1.x;
+    FT_Int64  ay = p0.y + p2.y - 2 * p1.y;
+    FT_Int64  bx = p1.x - p0.x;
+    FT_Int64  by = p1.y - p0.y;
+
+    FT_Int64  rx = ax << (33 - 2 * shift);
+    FT_Int64  ry = ay << (33 - 2 * shift);
+
+    FT_Int64  qx = (bx << (33 - shift)) + (ax << (32 - 2 * shift));
+    FT_Int64  qy = (by << (33 - shift)) + (ay << (32 - 2 * shift));
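+    /* rx/ry and qx/qy hold R << 32 and Q << 32 from the formulas above */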
+
+    FT_Int64  px = (FT_Int64)p0.x << 32;
+    FT_Int64  py = (FT_Int64)p0.y << 32;
+
+    FT_UInt  count = 1u << shift;
+
+    for (; count > 0; count--) {
+      px += qx;
+      py += qy;
+      qx += rx;
+      qy += ry;
+
+      gray_render_line( RAS_VAR_ (FT_Pos)(px >> 32), (FT_Pos)(py >> 32) );
+    }
+  }
+
+#else /* !BEZIER_USE_DDA */
+
+  /* Note that multiple attempts to speed up the function below
+   * with SSE2 intrinsics, using various data layouts, have turned
+   * out to be slower than the non-SIMD code below.
+   */
   static void
   gray_split_conic( FT_Vector*  base )
   {
@@ -1070,7 +1270,15 @@ typedef ptrdiff_t FT_PtrDist;
     } while ( --draw );
   }
 
+#endif /* !BEZIER_USE_DDA */
 
+  /* For cubic bezier, binary splits are still faster than DDA
+   * because the splits are adaptive to how quickly each sub-arc
+   * approaches its chord trisection points.
+   *
+   * It might be useful to experiment with SSE2 to speed up
+   * gray_split_cubic() though.
+   */
   static void
   gray_split_cubic( FT_Vector*  base )
   {
@@ -1161,7 +1369,6 @@ typedef ptrdiff_t FT_PtrDist;
     }
   }
 
-
   static int
   gray_move_to( const FT_Vector*  to,
                 gray_PWorker      worker )