From 56cc2ad446e920164e823e78a21972cadb339bfc Mon Sep 17 00:00:00 2001
From: David Turner
Date: Sat, 19 Jun 2021 10:26:53 +0200
Subject: [PATCH] [smooth] Implement Bezier quadratic arc flattening with DDA

Benchmarking shows that this provides a very slight performance boost
when rendering fonts with lots of quadratic Bezier arcs, compared to
the recursive arc splitting, but only when SSE2 is available, or on
64-bit CPUs.

On a 2017 Core i5-7300U CPU on Linux/x86_64:

  ./ftbench -p -s10 -t5 -cb .../DroidSansFallbackFull.ttf

    Before: 4.033 us/op  (best of 5 runs for all numbers)
    After:  3.876 us/op

  ./ftbench -p -s60 -t5 -cb .../DroidSansFallbackFull.ttf

    Before: 13.467 us/op
    After:  13.385 us/op
---
 ChangeLog            |  12 +++
 src/smooth/ftgrays.c | 191 ++++++++++++++++++++++++++++++++++++++++++-
 2 files changed, 202 insertions(+), 1 deletion(-)

diff --git a/ChangeLog b/ChangeLog
index ea269da30..a9d38aa7f 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,15 @@
+2021-07-15 David Turner
+
+    [smooth] Implement Bezier quadratic arc flattening with DDA
+
+    Benchmarking shows that this provides a very slight performance
+    boost when rendering fonts with lots of quadratic Bezier arcs,
+    compared to the recursive arc splitting, but only when SSE2 is
+    available, or on 64-bit CPUs.
+
+    * src/smooth/ftgrays.c (gray_render_conic): New implementation
+    based on DDA and optionally SSE2.
+
 2021-07-15 David Turner
 
     [smooth] Minor speedup to smooth rasterizer

diff --git a/src/smooth/ftgrays.c b/src/smooth/ftgrays.c
index e66ec34a7..7158cd21c 100644
--- a/src/smooth/ftgrays.c
+++ b/src/smooth/ftgrays.c
@@ -993,6 +993,188 @@ typedef ptrdiff_t FT_PtrDist;
 #endif
 
+/* Benchmarking shows that using DDA to flatten the quadratic Bezier
+ * arcs is slightly faster in the following cases:
+ *
+ * - When the host CPU is 64-bit.
+ * - When SSE2 SIMD registers and instructions are available (even on
+ *   x86).
+ *
+ * For other cases, using binary splits is actually slightly faster.
+ */
+#if defined(__SSE2__) || defined(__x86_64__) || defined(__aarch64__) || \
+    defined(_M_AMD64) || defined(_M_ARM64)
+#define BEZIER_USE_DDA  1
+#else
+#define BEZIER_USE_DDA  0
+#endif
+
+#if BEZIER_USE_DDA
+
+#include <emmintrin.h>
+
+  static void
+  gray_render_conic( RAS_ARG_ const FT_Vector*  control,
+                              const FT_Vector*  to )
+  {
+    FT_Vector  p0, p1, p2;
+
+    p0.x = ras.x;
+    p0.y = ras.y;
+    p1.x = UPSCALE( control->x );
+    p1.y = UPSCALE( control->y );
+    p2.x = UPSCALE( to->x );
+    p2.y = UPSCALE( to->y );
+
+    /* short-cut the arc that crosses the current band */
+    if ( ( TRUNC( p0.y ) >= ras.max_ey &&
+           TRUNC( p1.y ) >= ras.max_ey &&
+           TRUNC( p2.y ) >= ras.max_ey ) ||
+         ( TRUNC( p0.y ) < ras.min_ey &&
+           TRUNC( p1.y ) < ras.min_ey &&
+           TRUNC( p2.y ) < ras.min_ey ) )
+    {
+      ras.x = p2.x;
+      ras.y = p2.y;
+      return;
+    }
+
+    TPos  dx = FT_ABS( p0.x + p2.x - 2 * p1.x );
+    TPos  dy = FT_ABS( p0.y + p2.y - 2 * p1.y );
+    if ( dx < dy )
+      dx = dy;
+
+    if ( dx <= ONE_PIXEL / 4 )
+    {
+      gray_render_line( RAS_VAR_ p2.x, p2.y );
+      return;
+    }
+
+    /* We can calculate the number of necessary bisections because  */
+    /* each bisection predictably reduces deviation exactly 4-fold. */
+    /* Even 32-bit deviation would vanish after 16 bisections.      */
+    int  shift = 0;
+    do
+    {
+      dx >>= 2;
+      shift += 1;
+    }
+    while ( dx > ONE_PIXEL / 4 );
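+
+    /* Illustrative example (assuming the default PIXEL_BITS == 8,  */
+    /* i.e. ONE_PIXEL == 256): a deviation of dx == 3000 shrinks as */
+    /* 3000 -> 750 -> 187 -> 46, giving shift == 3, so the arc is   */
+    /* flattened into 2^3 == 8 line segments.                       */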
+
+    /*
+     * The (P0,P1,P2) arc equation, for t in the [0,1] range:
+     *
+     *   P(t) = P0*(1-t)^2 + P1*2*t*(1-t) + P2*t^2
+     *
+     *   P(t) = P0 + 2*(P1-P0)*t + (P0+P2-2*P1)*t^2
+     *        = P0 + 2*B*t + A*t^2
+     *
+     * for A = P0 + P2 - 2*P1
+     * and B = P1 - P0
+     *
+     * Let's consider the difference when advancing by a small
+     * parameter h:
+     *
+     *   Q(h,t) = P(t+h) - P(t) = 2*B*h + A*h^2 + 2*A*h*t
+     *
+     * And then its own difference:
+     *
+     *   R(h,t) = Q(h,t+h) - Q(h,t) = 2*A*h*h = R (constant)
+     *
+     * Since R is always a constant, it is possible to compute
+     * successive positions with:
+     *
+     *   P = P0
+     *   Q = Q(h,0) = 2*B*h + A*h*h
+     *   R = 2*A*h*h
+     *
+     *   loop:
+     *     P += Q
+     *     Q += R
+     *     EMIT(P)
+     *
+     * To ensure accurate results, perform computations on 64-bit
+     * values, after scaling them by 2^32.  With h = 1/2^N, where
+     * N is the `shift` computed above:
+     *
+     *   R << 32 = 2 * A << (32 - N - N)
+     *           = A << (33 - 2 * N)
+     *
+     *   Q << 32 = (2 * B << (32 - N)) + (A << (32 - N - N))
+     *           = (B << (33 - N)) + (A << (32 - N - N))
+     */
+#ifdef __SSE2__
+    /* Experience shows that for small shift values, */
+    /* SSE2 is actually slower.                      */
+    if ( shift > 2 )
+    {
+      union
+      {
+        struct { FT_Int64  ax, ay, bx, by; }  i;
+        struct { __m128i  a, b; }             vec;
+      } u;
+
+      u.i.ax = p0.x + p2.x - 2 * p1.x;
+      u.i.ay = p0.y + p2.y - 2 * p1.y;
+      u.i.bx = p1.x - p0.x;
+      u.i.by = p1.y - p0.y;
+
+      __m128i  a = _mm_load_si128( &u.vec.a );
+      __m128i  b = _mm_load_si128( &u.vec.b );
+
+      __m128i  r  = _mm_slli_epi64( a, 33 - 2 * shift );
+      __m128i  q  = _mm_slli_epi64( b, 33 - shift );
+      __m128i  q2 = _mm_slli_epi64( a, 32 - 2 * shift );
+      q = _mm_add_epi64( q2, q );
+
+      /* P is kept in the upper 32 bits of two 64-bit lanes; */
+      /* this layout assumes a little-endian CPU, which is   */
+      /* always the case when SSE2 is available.             */
+      union
+      {
+        struct { FT_Int32  px_lo, px_hi, py_lo, py_hi; }  i;
+        __m128i  vec;
+      } v;
+      v.i.px_lo = 0;
+      v.i.px_hi = p0.x;
+      v.i.py_lo = 0;
+      v.i.py_hi = p0.y;
+
+      __m128i  p = _mm_load_si128( &v.vec );
+
+      for ( unsigned count = ( 1u << shift ); count > 0; count-- )
+      {
+        p = _mm_add_epi64( p, q );
+        q = _mm_add_epi64( q, r );
+
+        _mm_store_si128( &v.vec, p );
+
+        gray_render_line( RAS_VAR_ v.i.px_hi, v.i.py_hi );
+      }
+      return;
+    }
+#endif /* __SSE2__ */
+
+    FT_Int64  ax = p0.x + p2.x - 2 * p1.x;
+    FT_Int64  ay = p0.y + p2.y - 2 * p1.y;
+    FT_Int64  bx = p1.x - p0.x;
+    FT_Int64  by = p1.y - p0.y;
+
+    FT_Int64  rx = ax << ( 33 - 2 * shift );
+    FT_Int64  ry = ay << ( 33 - 2 * shift );
+
+    FT_Int64  qx = ( bx << ( 33 - shift ) ) + ( ax << ( 32 - 2 * shift ) );
+    FT_Int64  qy = ( by << ( 33 - shift ) ) + ( ay << ( 32 - 2 * shift ) );
+
+    FT_Int64  px = (FT_Int64)p0.x << 32;
+    FT_Int64  py = (FT_Int64)p0.y << 32;
+
+    FT_UInt  count = 1u << shift;
+
+    for ( ; count > 0; count-- )
+    {
+      px += qx;
+      py += qy;
+      qx += rx;
+      qy += ry;
+
+      gray_render_line( RAS_VAR_ (FT_Pos)( px >> 32 ), (FT_Pos)( py >> 32 ) );
+    }
+  }
+
+#else /* !BEZIER_USE_DDA */
+
+  /* Note that multiple attempts to speed up the function below
+   * with SSE2 intrinsics, using various data layouts, have turned
+   * out to be slower than the non-SIMD code below.
+   */
   static void
   gray_split_conic( FT_Vector*  base )
   {
@@ -1078,7 +1260,15 @@ typedef ptrdiff_t FT_PtrDist;
     } while ( --draw );
   }
 
+#endif /* !BEZIER_USE_DDA */
+
+  /* For cubic Bezier arcs, binary splits are still faster than DDA
+   * because the splits are adaptive to how quickly each sub-arc
+   * approaches its chord trisection points.
+   *
+   * It might be useful to experiment with SSE2 to speed up
+   * gray_split_cubic() though.
+   */
   static void
   gray_split_cubic( FT_Vector*  base )
   {
@@ -1169,7 +1359,6 @@ typedef ptrdiff_t FT_PtrDist;
     }
   }
 
-
   static int
   gray_move_to( const FT_Vector*  to,
                 gray_PWorker      worker )
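
For reference, here is a small self-contained program that exercises the same
forward-differencing arithmetic outside the rasterizer.  This is only an
illustrative sketch, not part of the patch: the names `Point`, `emit` and
`flatten_conic_dda` are made up, coordinates are plain 24.8 fixed-point values
(the UPSCALE'd format with the default PIXEL_BITS == 8), and `shift` is passed
in directly instead of being derived from the deviation loop.

#include <stdint.h>
#include <stdio.h>

/* a point in 24.8 fixed-point coordinates */
typedef struct { int32_t x, y; } Point;

static void
emit( Point p )
{
  printf( "(%g, %g)\n", p.x / 256.0, p.y / 256.0 );
}

/* Flatten the quadratic arc (p0,p1,p2) into 2^shift line segments  */
/* with the constant-second-difference scheme described above.      */
static void
flatten_conic_dda( Point p0, Point p1, Point p2, int shift )
{
  /* A = P0 + P2 - 2*P1 and B = P1 - P0 */
  int64_t ax = (int64_t)p0.x + p2.x - 2 * p1.x;
  int64_t ay = (int64_t)p0.y + p2.y - 2 * p1.y;
  int64_t bx = (int64_t)p1.x - p0.x;
  int64_t by = (int64_t)p1.y - p0.y;

  /* with h = 1/2^shift: R = 2*A*h^2 and Q = 2*B*h + A*h^2,     */
  /* both pre-scaled by 2^32 (signed shifts, as in the patch)   */
  int64_t rx = ax << ( 33 - 2 * shift );
  int64_t ry = ay << ( 33 - 2 * shift );
  int64_t qx = ( bx << ( 33 - shift ) ) + ( ax << ( 32 - 2 * shift ) );
  int64_t qy = ( by << ( 33 - shift ) ) + ( ay << ( 32 - 2 * shift ) );

  int64_t px = (int64_t)p0.x << 32;
  int64_t py = (int64_t)p0.y << 32;

  for ( unsigned count = 1u << shift; count > 0; count-- )
  {
    px += qx;
    py += qy;
    qx += rx;
    qy += ry;

    emit( (Point){ (int32_t)( px >> 32 ), (int32_t)( py >> 32 ) } );
  }
}

int
main( void )
{
  /* a 4x1 pixel parabolic bump: P0=(0,0), P1=(2,2), P2=(4,0) */
  Point p0 = { 0 * 256, 0 * 256 };
  Point p1 = { 2 * 256, 2 * 256 };
  Point p2 = { 4 * 256, 0 * 256 };

  flatten_conic_dda( p0, p1, p2, 2 );  /* 2^2 == 4 segments */
  return 0;
}

For this arc the deviation loop in the patch would also select shift == 2,
and the program prints (1, 0.75), (2, 1), (3, 0.75), (4, 0), which matches
direct evaluation of P(t) at t = 1/4, 2/4, 3/4 and 4/4.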