[smooth] Implement Bezier quadratic arc flattenning with DDA
Benchmarking shows that this provides a very slighty performance boost when rendering fonts with lots of quadratic bezier arcs, compared to the recursive arc splitting, but only when SSE2 is available, or on 64-bit CPUs. On a 2017 Core i5-7300U CPU on Linux/x86_64: ./ftbench -p -s10 -t5 -cb .../DroidSansFallbackFull.ttf Before: 4.033 us/op (best of 5 runs for all numbers) After: 3.876 us/op ./ftbench -p -s60 -t5 -cb .../DroidSansFallbackFull.ttf Before: 13.467 us/op After: 13.385 us/op
This commit is contained in:
parent
86b9c9347f
commit
56cc2ad446
12
ChangeLog
12
ChangeLog
|
@ -1,3 +1,15 @@
|
|||
2021-07-15 David Turner <david@freetype.org>
|
||||
|
||||
[smooth] Implement Bezier quadratic arc flattenning with DDA
|
||||
|
||||
Benchmarking shows that this provides a very slighty performance
|
||||
boost when rendering fonts with lots of quadratic bezier arcs,
|
||||
compared to the recursive arc splitting, but only when SSE2 is
|
||||
available, or on 64-bit CPUs.
|
||||
|
||||
* src/smooth/ftgrays.c (gray_render_conic): New implementation
|
||||
based on DDA and optionally SSE2.
|
||||
|
||||
2021-07-15 David Turner <david@freetype.org>
|
||||
|
||||
[smooth] Minor speedup to smooth rasterizer
|
||||
|
|
|
@ -993,6 +993,188 @@ typedef ptrdiff_t FT_PtrDist;
|
|||
|
||||
#endif
|
||||
|
||||
/* Benchmarking shows that using DDA to flatten the quadratic bezier
|
||||
* arcs is slightly faster in the following cases:
|
||||
*
|
||||
* - When the host CPU is 64-bit.
|
||||
* - When SSE2 SIMD registers and instructions are available (even on x86).
|
||||
*
|
||||
* For other cases, using binary splits is actually slightly faster.
|
||||
*/
|
||||
#if defined(__SSE2__) || defined(__x86_64__) || defined(__aarch64__) || defined(_M_AMD64) || defined(_M_ARM64)
|
||||
#define BEZIER_USE_DDA 1
|
||||
#else
|
||||
#define BEZIER_USE_DDA 0
|
||||
#endif
|
||||
|
||||
#if BEZIER_USE_DDA
|
||||
|
||||
#include <emmintrin.h>
|
||||
|
||||
static void
|
||||
gray_render_conic( RAS_ARG_ const FT_Vector* control,
|
||||
const FT_Vector* to )
|
||||
{
|
||||
FT_Vector p0, p1, p2;
|
||||
|
||||
p0.x = ras.x;
|
||||
p0.y = ras.y;
|
||||
p1.x = UPSCALE( control->x );
|
||||
p1.y = UPSCALE( control->y );
|
||||
p2.x = UPSCALE( to->x );
|
||||
p2.y = UPSCALE( to->y );
|
||||
|
||||
/* short-cut the arc that crosses the current band */
|
||||
if ( ( TRUNC( p0.y ) >= ras.max_ey &&
|
||||
TRUNC( p1.y ) >= ras.max_ey &&
|
||||
TRUNC( p2.y ) >= ras.max_ey ) ||
|
||||
( TRUNC( p0.y ) < ras.min_ey &&
|
||||
TRUNC( p1.y ) < ras.min_ey &&
|
||||
TRUNC( p2.y ) < ras.min_ey ) )
|
||||
{
|
||||
ras.x = p2.x;
|
||||
ras.y = p2.y;
|
||||
return;
|
||||
}
|
||||
|
||||
TPos dx = FT_ABS( p0.x + p2.x - 2 * p1.x );
|
||||
TPos dy = FT_ABS( p0.y + p2.y - 2 * p1.y );
|
||||
if ( dx < dy )
|
||||
dx = dy;
|
||||
|
||||
if ( dx <= ONE_PIXEL / 4 )
|
||||
{
|
||||
gray_render_line( RAS_VAR_ p2.x, p2.y );
|
||||
return;
|
||||
}
|
||||
|
||||
/* We can calculate the number of necessary bisections because */
|
||||
/* each bisection predictably reduces deviation exactly 4-fold. */
|
||||
/* Even 32-bit deviation would vanish after 16 bisections. */
|
||||
int shift = 0;
|
||||
do
|
||||
{
|
||||
dx >>= 2;
|
||||
shift += 1;
|
||||
}
|
||||
while (dx > ONE_PIXEL / 4);
|
||||
|
||||
/*
|
||||
* The (P0,P1,P2) arc equation, for t in [0,1] range:
|
||||
*
|
||||
* P(t) = P0*(1-t)^2 + P1*2*t*(1-t) + P2*t^2
|
||||
*
|
||||
* P(t) = P0 + 2*(P1-P0)*t + (P0+P2-2*P1)*t^2
|
||||
* = P0 + 2*B*t + A*t^2
|
||||
*
|
||||
* for A = P0 + P2 - 2*P1
|
||||
* and B = P1 - P0
|
||||
*
|
||||
* Let's consider the difference when advancing by a small
|
||||
* parameter h:
|
||||
*
|
||||
* Q(h,t) = P(t+h) - P(t) = 2*B*h + A*h^2 + 2*A*h*t
|
||||
*
|
||||
* And then its own difference:
|
||||
*
|
||||
* R(h,t) = Q(h,t+h) - Q(h,t) = 2*A*h*h = R (constant)
|
||||
*
|
||||
* Since R is always a constant, it is possible to compute
|
||||
* successive positions with:
|
||||
*
|
||||
* P = P0
|
||||
* Q = Q(h,0) = 2*B*h + A*h*h
|
||||
* R = 2*A*h*h
|
||||
*
|
||||
* loop:
|
||||
* P += Q
|
||||
* Q += R
|
||||
* EMIT(P)
|
||||
*
|
||||
* To ensure accurate results, perform computations on 64-bit
|
||||
* values, after scaling them by 2^32:
|
||||
*
|
||||
* R << 32 = 2 * A << (32 - N - N)
|
||||
* = A << (33 - 2 *N)
|
||||
*
|
||||
* Q << 32 = (2 * B << (32 - N)) + (A << (32 - N - N))
|
||||
* = (B << (33 - N)) + (A << (32 - N - N))
|
||||
*/
|
||||
#ifdef __SSE2__
|
||||
/* Experience shows that for small shift values, SSE2 is actually slower. */
|
||||
if (shift > 2) {
|
||||
union {
|
||||
struct { FT_Int64 ax, ay, bx, by; } i;
|
||||
struct { __m128i a, b; } vec;
|
||||
} u;
|
||||
|
||||
u.i.ax = p0.x + p2.x - 2 * p1.x;
|
||||
u.i.ay = p0.y + p2.y - 2 * p1.y;
|
||||
u.i.bx = p1.x - p0.x;
|
||||
u.i.by = p1.y - p0.y;
|
||||
|
||||
__m128i a = _mm_load_si128(&u.vec.a);
|
||||
__m128i b = _mm_load_si128(&u.vec.b);
|
||||
|
||||
__m128i r = _mm_slli_epi64(a, 33 - 2 * shift);
|
||||
__m128i q = _mm_slli_epi64(b, 33 - shift);
|
||||
__m128i q2 = _mm_slli_epi64(a, 32 - 2 * shift);
|
||||
q = _mm_add_epi64(q2, q);
|
||||
|
||||
union {
|
||||
struct { FT_Int32 px_lo, px_hi, py_lo, py_hi; } i;
|
||||
__m128i vec;
|
||||
} v;
|
||||
v.i.px_lo = 0;
|
||||
v.i.px_hi = p0.x;
|
||||
v.i.py_lo = 0;
|
||||
v.i.py_hi = p0.y;
|
||||
|
||||
__m128i p = _mm_load_si128(&v.vec);
|
||||
|
||||
for (unsigned count = (1u << shift); count > 0; count--) {
|
||||
p = _mm_add_epi64(p, q);
|
||||
q = _mm_add_epi64(q, r);
|
||||
|
||||
_mm_store_si128(&v.vec, p);
|
||||
|
||||
gray_render_line( RAS_VAR_ v.i.px_hi, v.i.py_hi);
|
||||
}
|
||||
return;
|
||||
}
|
||||
#endif /* !__SSE2__ */
|
||||
FT_Int64 ax = p0.x + p2.x - 2 * p1.x;
|
||||
FT_Int64 ay = p0.y + p2.y - 2 * p1.y;
|
||||
FT_Int64 bx = p1.x - p0.x;
|
||||
FT_Int64 by = p1.y - p0.y;
|
||||
|
||||
FT_Int64 rx = ax << (33 - 2 * shift);
|
||||
FT_Int64 ry = ay << (33 - 2 * shift);
|
||||
|
||||
FT_Int64 qx = (bx << (33 - shift)) + (ax << (32 - 2 * shift));
|
||||
FT_Int64 qy = (by << (33 - shift)) + (ay << (32 - 2 * shift));
|
||||
|
||||
FT_Int64 px = (FT_Int64)p0.x << 32;
|
||||
FT_Int64 py = (FT_Int64)p0.y << 32;
|
||||
|
||||
FT_UInt count = 1u << shift;
|
||||
|
||||
for (; count > 0; count--) {
|
||||
px += qx;
|
||||
py += qy;
|
||||
qx += rx;
|
||||
qy += ry;
|
||||
|
||||
gray_render_line( RAS_VAR_ (FT_Pos)(px >> 32), (FT_Pos)(py >> 32));
|
||||
}
|
||||
}
|
||||
|
||||
#else /* !BEZIER_USE_DDA */
|
||||
|
||||
/* Note that multiple attempts to speed up the function below
|
||||
* with SSE2 intrinsics, using various data layouts, have turned
|
||||
* out to be slower than the non-SIMD code below.
|
||||
*/
|
||||
static void
|
||||
gray_split_conic( FT_Vector* base )
|
||||
{
|
||||
|
@ -1078,7 +1260,15 @@ typedef ptrdiff_t FT_PtrDist;
|
|||
} while ( --draw );
|
||||
}
|
||||
|
||||
#endif /* !BEZIER_USE_DDA */
|
||||
|
||||
/* For cubic bezier, binary splits are still faster than DDA
|
||||
* because the splits are adaptive to how quickly each sub-arc
|
||||
* approaches their chord trisection points.
|
||||
*
|
||||
* It might be useful to experiment with SSE2 to speed up
|
||||
* gray_split_cubic() though.
|
||||
*/
|
||||
static void
|
||||
gray_split_cubic( FT_Vector* base )
|
||||
{
|
||||
|
@ -1169,7 +1359,6 @@ typedef ptrdiff_t FT_PtrDist;
|
|||
}
|
||||
}
|
||||
|
||||
|
||||
static int
|
||||
gray_move_to( const FT_Vector* to,
|
||||
gray_PWorker worker )
|
||||
|
|
Loading…
Reference in New Issue