From 7b308a29dd105074eea9c8d5953a182d325f74f1 Mon Sep 17 00:00:00 2001 From: Alexei Podtelezhnikov Date: Thu, 28 Sep 2023 22:56:15 -0400 Subject: [PATCH] * src/smooth/ftgrays.c (gray_render_conic) [SSE2]: Improve flow. --- src/smooth/ftgrays.c | 50 ++++++++++++++++++++++---------------------- 1 file changed, 25 insertions(+), 25 deletions(-) diff --git a/src/smooth/ftgrays.c b/src/smooth/ftgrays.c index 0918272f8..fbf1541ef 100644 --- a/src/smooth/ftgrays.c +++ b/src/smooth/ftgrays.c @@ -1095,16 +1095,17 @@ typedef ptrdiff_t FT_PtrDist; return; } - /* We can calculate the number of necessary bisections because */ + /* We can calculate the number of necessary segments because */ /* each bisection predictably reduces deviation exactly 4-fold. */ /* Even 32-bit deviation would vanish after 16 bisections. */ - shift = 0; + shift = 16; do { - dx >>= 2; - shift += 1; + dx >>= 2; + shift--; } while ( dx > ONE_PIXEL / 4 ); + count = 0x10000U >> shift; /* * The (P0,P1,P2) arc equation, for t in [0,1] range: @@ -1151,9 +1152,8 @@ typedef ptrdiff_t FT_PtrDist; */ #if FT_SSE2 - /* Experience shows that for small shift values, */ - /* SSE2 is actually slower. */ - if ( shift > 2 ) + /* Experience shows that for small counts, SSE2 is actually slower. */ + if ( count > 4 ) { union { @@ -1169,9 +1169,7 @@ typedef ptrdiff_t FT_PtrDist; } v; - __m128i a, b; - __m128i r, q, q2; - __m128i p; + __m128i p, q, r; u.i.ax = ax; @@ -1179,14 +1177,13 @@ typedef ptrdiff_t FT_PtrDist; u.i.bx = bx; u.i.by = by; - a = _mm_load_si128( &u.vec.a ); - b = _mm_load_si128( &u.vec.b ); + q = _mm_load_si128( &u.vec.b ); + r = _mm_load_si128( &u.vec.a ); - r = _mm_slli_epi64( a, 33 - 2 * shift ); - q = _mm_slli_epi64( b, 33 - shift ); - q2 = _mm_slli_epi64( a, 32 - 2 * shift ); - - q = _mm_add_epi64( q2, q ); + q = _mm_slli_epi64( q, shift + 17); + r = _mm_slli_epi64( r, shift + shift ); + q = _mm_add_epi64( q, r ); + r = _mm_add_epi64( r, r ); v.i.px_lo = 0; v.i.px_hi = p0.x; @@ -1195,7 +1192,7 @@ typedef ptrdiff_t FT_PtrDist; p = _mm_load_si128( &v.vec ); - for ( count = 1U << shift; count > 0; count-- ) + do { p = _mm_add_epi64( p, q ); q = _mm_add_epi64( q, r ); @@ -1203,22 +1200,25 @@ typedef ptrdiff_t FT_PtrDist; _mm_store_si128( &v.vec, p ); gray_render_line( RAS_VAR_ v.i.px_hi, v.i.py_hi ); - } + } while ( --count ); return; } #endif /* FT_SSE2 */ - rx = LEFT_SHIFT( ax, 33 - 2 * shift ); - ry = LEFT_SHIFT( ay, 33 - 2 * shift ); + rx = LEFT_SHIFT( ax, shift + shift ); + ry = LEFT_SHIFT( ay, shift + shift ); - qx = LEFT_SHIFT( bx, 33 - shift ) + LEFT_SHIFT( ax, 32 - 2 * shift ); - qy = LEFT_SHIFT( by, 33 - shift ) + LEFT_SHIFT( ay, 32 - 2 * shift ); + qx = LEFT_SHIFT( bx, shift + 17 ) + rx; + qy = LEFT_SHIFT( by, shift + 17 ) + ry; + + rx *= 2; + ry *= 2; px = LEFT_SHIFT( p0.x, 32 ); py = LEFT_SHIFT( p0.y, 32 ); - for ( count = 1U << shift; count > 0; count-- ) + do { px += qx; py += qy; @@ -1227,7 +1227,7 @@ typedef ptrdiff_t FT_PtrDist; gray_render_line( RAS_VAR_ (FT_Pos)( px >> 32 ), (FT_Pos)( py >> 32 ) ); - } + } while ( --count ); } #else /* !BEZIER_USE_DDA */