freetype-commit

From: David Turner (@david.freetype)
Subject: [Git][freetype/freetype][split-conic-dda] [smooth] Implement Bezier quadratic arc flattening with DDA
Date: Mon, 12 Jul 2021 08:17:12 +0000

David Turner pushed to branch split-conic-dda at FreeType / FreeType


2 changed files:

Changes:

  • ChangeLog

    +2021-07-12  David Turner  <david@freetype.org>
    +
    +	[smooth] Implement Bezier quadratic arc flattening with DDA
    +
    +	Benchmarking shows that this provides a very slight performance
    +	boost when rendering fonts with lots of quadratic bezier arcs,
    +	compared to the recursive arc splitting, but only when SSE2 is
    +	available or on 64-bit CPUs.
    +
    +	* src/smooth/ftgrays.c (gray_render_conic): New implementation
    +	based on DDA and optionally SSE2.
    +
     2021-07-12  David Turner  <david@freetype.org>
     
     	[smooth] Minor speedup to smooth rasterizer

  • src/smooth/ftgrays.c

    @@ -985,6 +985,188 @@ typedef ptrdiff_t FT_PtrDist;
     
     #endif
     
    +/* Benchmarking shows that using DDA to flatten the quadratic bezier
    + * arcs is slightly faster in the following cases:
    + *
    + *   - When the host CPU is 64-bit.
    + *   - When SSE2 SIMD registers and instructions are available (even on x86).
    + *
    + * For other cases, using binary splits is actually slightly faster.
    + */
    +#if defined(__SSE2__) || defined(__x86_64__) || defined(__aarch64__) || defined(_M_AMD64) || defined(_M_ARM64)
    +#define BEZIER_USE_DDA  1
    +#else
    +#define BEZIER_USE_DDA  0
    +#endif
    +
    +#if BEZIER_USE_DDA
    +
    +#ifdef __SSE2__
    +#include <emmintrin.h>
    +#endif
    +
    +  static void
    +  gray_render_conic( RAS_ARG_ const FT_Vector*  control,
    +                              const FT_Vector*  to )
    +  {
    +    FT_Vector  p0, p1, p2;
    +
    +    p0.x = ras.x;
    +    p0.y = ras.y;
    +    p1.x = UPSCALE( control->x );
    +    p1.y = UPSCALE( control->y );
    +    p2.x = UPSCALE( to->x );
    +    p2.y = UPSCALE( to->y );
    +
    +    /* short-cut the arc if it lies entirely above or below the current band */
    +    if ( ( TRUNC( p0.y ) >= ras.max_ey &&
    +           TRUNC( p1.y ) >= ras.max_ey &&
    +           TRUNC( p2.y ) >= ras.max_ey ) ||
    +         ( TRUNC( p0.y ) <  ras.min_ey &&
    +           TRUNC( p1.y ) <  ras.min_ey &&
    +           TRUNC( p2.y ) <  ras.min_ey ) )
    +    {
    +      ras.x = p2.x;
    +      ras.y = p2.y;
    +      return;
    +    }
    +
    +    TPos  dx = FT_ABS( p0.x + p2.x - 2 * p1.x );
    +    TPos  dy = FT_ABS( p0.y + p2.y - 2 * p1.y );
    +
    +    if ( dx < dy )
    +      dx = dy;
    +
    +    if ( dx <= ONE_PIXEL / 4 )
    +    {
    +      gray_render_line( RAS_VAR_ p2.x, p2.y );
    +      return;
    +    }
    +
    +    /* We can calculate the number of necessary bisections because  */
    +    /* each bisection predictably reduces deviation exactly 4-fold. */
    +    /* Even 32-bit deviation would vanish after 16 bisections.      */
    +    int  shift = 0;
    +
    +    do
    +    {
    +      dx    >>= 2;
    +      shift  += 1;
    +    }
    +    while ( dx > ONE_PIXEL / 4 );
    +
    +    /*
    +     * The (P0,P1,P2) arc equation, for t in the [0,1] range:
    +     *
    +     *   P(t) = P0*(1-t)^2 + P1*2*t*(1-t) + P2*t^2
    +     *
    +     *   P(t) = P0 + 2*(P1-P0)*t + (P0+P2-2*P1)*t^2
    +     *        = P0 + 2*B*t + A*t^2
    +     *
    +     *   with A = P0 + P2 - 2*P1
    +     *   and  B = P1 - P0
    +     *
    +     * Now consider the difference when advancing by a small
    +     * parameter h:
    +     *
    +     *   Q(h,t) = P(t+h) - P(t) = 2*B*h + A*h^2 + 2*A*h*t
    +     *
    +     * and then its own difference:
    +     *
    +     *   R(h,t) = Q(h,t+h) - Q(h,t) = 2*A*h*h = R (constant)
    +     *
    +     * Since R is a constant, it is possible to compute
    +     * successive positions with:
    +     *
    +     *   P = P0
    +     *   Q = Q(h,0) = 2*B*h + A*h*h
    +     *   R = 2*A*h*h
    +     *
    +     *   loop:
    +     *     P += Q
    +     *     Q += R
    +     *     EMIT(P)
    +     *
    +     * To ensure accurate results, perform computations on 64-bit
    +     * values, after scaling them by 2^32.  With h = 1/2^N, this
    +     * gives:
    +     *
    +     *   R << 32 = 2 * A << (32 - N - N)
    +     *           = A << (33 - 2*N)
    +     *
    +     *   Q << 32 = (2 * B << (32 - N)) + (A << (32 - N - N))
    +     *           = (B << (33 - N)) + (A << (32 - 2*N))
    +     */
    +#ifdef __SSE2__
    +    /* Experience shows that for small shift values, */
    +    /* SSE2 is actually slower.                      */
    +    if ( shift > 2 )
    +    {
    +      union
    +      {
    +        struct { FT_Int64  ax, ay, bx, by; }  i;
    +        struct { __m128i   a, b; }            vec;
    +      } u;
    +
    +      u.i.ax = p0.x + p2.x - 2 * p1.x;
    +      u.i.ay = p0.y + p2.y - 2 * p1.y;
    +      u.i.bx = p1.x - p0.x;
    +      u.i.by = p1.y - p0.y;
    +
    +      __m128i  a = _mm_load_si128( &u.vec.a );
    +      __m128i  b = _mm_load_si128( &u.vec.b );
    +
    +      __m128i  r  = _mm_slli_epi64( a, 33 - 2 * shift );
    +      __m128i  q  = _mm_slli_epi64( b, 33 - shift );
    +      __m128i  q2 = _mm_slli_epi64( a, 32 - 2 * shift );
    +
    +      q = _mm_add_epi64( q2, q );
    +
    +      union
    +      {
    +        struct { FT_Int32  px_lo, px_hi, py_lo, py_hi; }  i;
    +        __m128i  vec;
    +      } v;
    +
    +      v.i.px_lo = 0;
    +      v.i.px_hi = p0.x;
    +      v.i.py_lo = 0;
    +      v.i.py_hi = p0.y;
    +
    +      __m128i  p = _mm_load_si128( &v.vec );
    +
    +      for ( unsigned  count = 1u << shift; count > 0; count-- )
    +      {
    +        p = _mm_add_epi64( p, q );
    +        q = _mm_add_epi64( q, r );
    +
    +        _mm_store_si128( &v.vec, p );
    +
    +        gray_render_line( RAS_VAR_ v.i.px_hi, v.i.py_hi );
    +      }
    +
    +      return;
    +    }
    +#endif  /* __SSE2__ */
    +
    +    FT_Int64  ax = p0.x + p2.x - 2 * p1.x;
    +    FT_Int64  ay = p0.y + p2.y - 2 * p1.y;
    +    FT_Int64  bx = p1.x - p0.x;
    +    FT_Int64  by = p1.y - p0.y;
    +
    +    FT_Int64  rx = ax << ( 33 - 2 * shift );
    +    FT_Int64  ry = ay << ( 33 - 2 * shift );
    +
    +    FT_Int64  qx = ( bx << ( 33 - shift ) ) + ( ax << ( 32 - 2 * shift ) );
    +    FT_Int64  qy = ( by << ( 33 - shift ) ) + ( ay << ( 32 - 2 * shift ) );
    +
    +    FT_Int64  px = (FT_Int64)p0.x << 32;
    +    FT_Int64  py = (FT_Int64)p0.y << 32;
    +
    +    FT_UInt  count = 1u << shift;
    +
    +    for ( ; count > 0; count-- )
    +    {
    +      px += qx;
    +      py += qy;
    +      qx += rx;
    +      qy += ry;
    +
    +      gray_render_line( RAS_VAR_ (FT_Pos)( px >> 32 ), (FT_Pos)( py >> 32 ) );
    +    }
    +  }
    +
    +#else  /* !BEZIER_USE_DDA */
    +
    +  /* Note that multiple attempts to speed up the function below
    +   * with SSE2 intrinsics, using various data layouts, have turned
    +   * out to be slower than the non-SIMD code below.
    +   */
       static void
       gray_split_conic( FT_Vector*  base )
       {
    @@ -1070,7 +1252,15 @@ typedef ptrdiff_t FT_PtrDist;
         } while ( --draw );
       }
     
    +#endif  /* !BEZIER_USE_DDA */
     
    +  /* For cubic beziers, binary splits are still faster than DDA
    +   * because the splits adapt to how quickly each sub-arc
    +   * approaches its chord trisection points.
    +   *
    +   * It might be useful to experiment with SSE2 to speed up
    +   * gray_split_cubic(), though.
    +   */
       static void
       gray_split_cubic( FT_Vector*  base )
       {
    @@ -1161,7 +1351,6 @@ typedef ptrdiff_t FT_PtrDist;
         }
       }
     
    -
       static int
       gray_move_to( const FT_Vector*  to,
                     gray_PWorker      worker )
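
For reference, the Q and R formulas quoted in the patch's comment block follow
from a direct expansion of P(t) = P0 + 2*B*t + A*t^2, using the same A, B, h
and t defined there (nothing new is introduced):

    Q(h,t) = P(t+h) - P(t)
           = 2*B*(t+h) + A*(t+h)^2 - ( 2*B*t + A*t^2 )
           = 2*B*h + A*h^2 + 2*A*h*t

    R(h,t) = Q(h,t+h) - Q(h,t)
           = 2*A*h*(t+h) - 2*A*h*t
           = 2*A*h^2

Because R is independent of t, the flattening loop needs only two additions
per coordinate for each emitted point.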
    

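Below is a minimal, self-contained sketch (hypothetical file and variable
names, not part of the commit) of the scalar DDA path above: it flattens one
quadratic arc in 32.32 fixed point and cross-checks every emitted point
against a direct evaluation of P(t) at t = k/2^N.  Multiplications by powers
of two stand in for the patch's left shifts so that negative A and B stay
well defined in portable C.

/* dda_quad_demo.c -- standalone illustration, not FreeType code.
 * Build:  cc -std=c99 dda_quad_demo.c -o dda_quad_demo
 */
#include <assert.h>
#include <stdint.h>
#include <stdio.h>

/* (v) * 2^(s), well defined for negative v, unlike `v << s` */
#define SCALE( v, s )  ( (int64_t)( v ) * ( (int64_t)1 << ( s ) ) )

int
main( void )
{
  /* arbitrary integer control points (stand-ins for UPSCALE'd values) */
  const int64_t  p0x = 100, p0y = 700;   /* start point   */
  const int64_t  p1x = 300, p1y = 100;   /* control point */
  const int64_t  p2x = 900, p2y = 650;   /* end point     */

  const int  N = 4;    /* emit 2^N segments; must satisfy 1 <= N <= 16 */

  /* A = P0 + P2 - 2*P1 and B = P1 - P0, as in the patch's comment */
  const int64_t  ax = p0x + p2x - 2 * p1x;
  const int64_t  ay = p0y + p2y - 2 * p1y;
  const int64_t  bx = p1x - p0x;
  const int64_t  by = p1y - p0y;

  /* R = 2*A*h^2 and Q = 2*B*h + A*h^2, pre-scaled by 2^32, h = 1/2^N */
  int64_t  rx = SCALE( ax, 33 - 2 * N );
  int64_t  ry = SCALE( ay, 33 - 2 * N );
  int64_t  qx = SCALE( bx, 33 - N ) + SCALE( ax, 32 - 2 * N );
  int64_t  qy = SCALE( by, 33 - N ) + SCALE( ay, 32 - 2 * N );

  int64_t  px = SCALE( p0x, 32 );
  int64_t  py = SCALE( p0y, 32 );

  for ( int k = 1; k <= 1 << N; k++ )
  {
    px += qx;
    py += qy;
    qx += rx;
    qy += ry;

    /* direct evaluation of P(k/2^N) << 32 for comparison */
    int64_t  ex = SCALE( p0x, 32 ) + SCALE( 2 * bx * k, 32 - N )
                                   + SCALE( ax * k * k, 32 - 2 * N );
    int64_t  ey = SCALE( p0y, 32 ) + SCALE( 2 * by * k, 32 - N )
                                   + SCALE( ay * k * k, 32 - 2 * N );

    assert( px == ex && py == ey );   /* the DDA is exact in 32.32 */

    printf( "emit ( %lld, %lld )\n",
            (long long)( px >> 32 ), (long long)( py >> 32 ) );
  }

  return 0;
}

That the assertion holds for every k is the reason the inner loop can get
away with plain additions: since h is an exact power of two, Q and R have
exact 32.32 representations, so no rounding error accumulates across the
2^N steps.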
