@@ -985,6 +985,206 @@ typedef ptrdiff_t FT_PtrDist;
 
 #endif
 
+/* Benchmarking shows that using DDA to flatten the quadratic bezier
+ * arcs is slightly faster in the following cases:
+ *
+ *   - When the host CPU is 64-bit.
+ *   - When SSE2 SIMD registers and instructions are available (even on x86).
+ *
+ * For other cases, using binary splits is actually slightly faster.
+ */
+#if defined(__SSE2__) || defined(__x86_64__) || defined(__aarch64__) || defined(_M_AMD64) || defined(_M_ARM64)
+#define BEZIER_USE_DDA 1
+#else
+#define BEZIER_USE_DDA 0
+#endif
+
+#if BEZIER_USE_DDA
+
+/* <emmintrin.h> is only available where SSE2 exists; other 64-bit */
+/* targets (e.g. AArch64) use the plain C DDA path below.          */
+#ifdef __SSE2__
+#include <emmintrin.h>
+#endif
+
+  static void
+  gray_render_conic( RAS_ARG_ const FT_Vector*  control,
+                              const FT_Vector*  to )
+  {
+    FT_Vector  p0, p1, p2;
+
+    p0.x = ras.x;
+    p0.y = ras.y;
+    p1.x = UPSCALE( control->x );
+    p1.y = UPSCALE( control->y );
+    p2.x = UPSCALE( to->x );
+    p2.y = UPSCALE( to->y );
+
+    /* short-cut the arc if it lies entirely outside the current band */
+    if ( ( TRUNC( p0.y ) >= ras.max_ey &&
+           TRUNC( p1.y ) >= ras.max_ey &&
+           TRUNC( p2.y ) >= ras.max_ey ) ||
+         ( TRUNC( p0.y ) < ras.min_ey &&
+           TRUNC( p1.y ) < ras.min_ey &&
+           TRUNC( p2.y ) < ras.min_ey ) )
+    {
+      ras.x = p2.x;
+      ras.y = p2.y;
+      return;
+    }
+
+    TPos  dx = FT_ABS( p0.x + p2.x - 2 * p1.x );
+    TPos  dy = FT_ABS( p0.y + p2.y - 2 * p1.y );
+    if ( dx < dy )
+      dx = dy;
+
+    if ( dx <= ONE_PIXEL / 4 )
+    {
+      gray_render_line( RAS_VAR_ p2.x, p2.y );
+      return;
+    }
+
+    /* We can calculate the number of necessary bisections because  */
+    /* each bisection predictably reduces deviation exactly 4-fold. */
+    /* Even 32-bit deviation would vanish after 16 bisections.      */
+    int  shift = 0;
+    do
+    {
+      dx    >>= 2;
+      shift  += 1;
+    }
+    while ( dx > ONE_PIXEL / 4 );
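+
+    /* The DDA below emits 2^shift line segments, i.e. as many as */
+    /* `shift` recursive bisections would have produced.          */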
+
+    /*
+     * The (P0,P1,P2) arc equation, for t in [0,1] range:
+     *
+     *   P(t) = P0*(1-t)^2 + P1*2*t*(1-t) + P2*t^2
+     *
+     *   P(t) = P0 + 2*(P1-P0)*t + (P0+P2-2*P1)*t^2
+     *        = P0 + 2*B*t + A*t^2
+     *
+     * for A = P0 + P2 - 2*P1
+     * and B = P1 - P0
+     *
+     * Let's consider the difference when advancing by a small
+     * parameter h:
+     *
+     *   Q(h,t) = P(t+h) - P(t) = 2*B*h + A*h^2 + 2*A*h*t
+     *
+     * And then its own difference:
+     *
+     *   R(h,t) = Q(h,t+h) - Q(h,t) = 2*A*h*h = R (constant)
+     *
+     * Since R is always a constant, it is possible to compute
+     * successive positions with:
+     *
+     *   P = P0
+     *   Q = Q(h,0) = 2*B*h + A*h*h
+     *   R = 2*A*h*h
+     *
+     * loop:
+     *   P += Q
+     *   Q += R
+     *   EMIT(P)
+     *
+     * To ensure accurate results, perform computations on 64-bit
+     * values, after scaling them by 2^32.  With h = 1/2^N, where
+     * N is the `shift` value computed above, this gives:
+     *
+     *   R << 32 = 2 * A << (32 - N - N)
+     *           = A << (33 - 2 * N)
+     *
+     *   Q << 32 = (2 * B << (32 - N)) + (A << (32 - N - N))
+     *           = (B << (33 - N)) + (A << (32 - N - N))
+     */
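+
+    /* A handy sanity check: the iteration sums telescope, so after  */
+    /* all 2^N steps P accumulates exactly P0 + 2*B + A = P2, i.e.   */
+    /* the DDA always finishes precisely on the arc's end point.     */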
+#ifdef __SSE2__
+    /* Experience shows that for small shift values, SSE2 is actually slower. */
+    if (shift > 2) {
+      union {
+        struct { FT_Int64  ax, ay, bx, by; } i;
+        struct { __m128i   a, b; }           vec;
+      } u;
+
+      u.i.ax = p0.x + p2.x - 2 * p1.x;
+      u.i.ay = p0.y + p2.y - 2 * p1.y;
+      u.i.bx = p1.x - p0.x;
+      u.i.by = p1.y - p0.y;
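+      /* each __m128i overlays two 64-bit lanes: a = (ax, ay), b = (bx, by) */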
+
+      __m128i a = _mm_load_si128(&u.vec.a);
+      __m128i b = _mm_load_si128(&u.vec.b);
+
+      __m128i r  = _mm_slli_epi64(a, 33 - 2 * shift);
+      __m128i q  = _mm_slli_epi64(b, 33 - shift);
+      __m128i q2 = _mm_slli_epi64(a, 32 - 2 * shift);
+      q = _mm_add_epi64(q2, q);
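+      /* r and q now hold R << 32 and Q << 32 in per-axis 64-bit lanes */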
+
+      union {
+        struct { FT_Int32  px_lo, px_hi, py_lo, py_hi; } i;
+        __m128i  vec;
+      } v;
+      v.i.px_lo = 0;
+      v.i.px_hi = p0.x;
+      v.i.py_lo = 0;
+      v.i.py_hi = p0.y;
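+      /* writing p0 into the high 32-bit halves implements the << 32 */
+      /* scaling; this lo/hi layout assumes little-endian order,     */
+      /* which always holds where SSE2 is available                  */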
+
+      __m128i p = _mm_load_si128(&v.vec);
+
+      for (unsigned count = (1u << shift); count > 0; count--) {
+        p = _mm_add_epi64(p, q);
+        q = _mm_add_epi64(q, r);
+
+        _mm_store_si128(&v.vec, p);
+
+        gray_render_line( RAS_VAR_ v.i.px_hi, v.i.py_hi );
+      }
+      return;
+    }
+#endif /* __SSE2__ */
+    FT_Int64  ax = p0.x + p2.x - 2 * p1.x;
+    FT_Int64  ay = p0.y + p2.y - 2 * p1.y;
+    FT_Int64  bx = p1.x - p0.x;
+    FT_Int64  by = p1.y - p0.y;
+
+    FT_Int64  rx = ax << (33 - 2 * shift);
+    FT_Int64  ry = ay << (33 - 2 * shift);
+
+    FT_Int64  qx = (bx << (33 - shift)) + (ax << (32 - 2 * shift));
+    FT_Int64  qy = (by << (33 - shift)) + (ay << (32 - 2 * shift));
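+    /* rx/ry and qx/qy hold R << 32 and Q << 32 from the formulas above */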
+
+    FT_Int64  px = (FT_Int64)p0.x << 32;
+    FT_Int64  py = (FT_Int64)p0.y << 32;
+
+    FT_UInt  count = 1u << shift;
+
+    for (; count > 0; count--) {
+      px += qx;
+      py += qy;
+      qx += rx;
+      qy += ry;
+
+      gray_render_line( RAS_VAR_ (FT_Pos)(px >> 32), (FT_Pos)(py >> 32) );
+    }
+  }
+
+#else /* !BEZIER_USE_DDA */
+
+  /* Note that multiple attempts to speed up the function below
+   * with SSE2 intrinsics, using various data layouts, have turned
+   * out to be slower than the non-SIMD code below.
+   */
   static void
   gray_split_conic( FT_Vector*  base )
   {
@@ -1070,7 +1270,15 @@ typedef ptrdiff_t FT_PtrDist;
     } while ( --draw );
   }
 
+#endif /* !BEZIER_USE_DDA */
 
+  /* For cubic bezier, binary splits are still faster than DDA
+   * because the splits are adaptive to how quickly each sub-arc
+   * approaches its chord trisection points.
+   *
+   * It might be useful to experiment with SSE2 to speed up
+   * gray_split_cubic() though.
+   */
   static void
   gray_split_cubic( FT_Vector*  base )
   {
@@ -1161,7 +1369,6 @@ typedef ptrdiff_t FT_PtrDist;
     }
   }
 
-
   static int
   gray_move_to( const FT_Vector*  to,
                 gray_PWorker      worker )