From c381aaa3d1e2dc68b83d0c6d390eb65abf9e9ae5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Alexei=20Podtelezhnikov=20=28=D0=90=D0=BB=D0=B5=D0=BA?= =?UTF-8?q?=D1=81=D0=B5=D0=B9=20=D0=9F=D0=BE=D0=B4=D1=82=D0=B5=D0=BB=D0=B5?= =?UTF-8?q?=D0=B6=D0=BD=D0=B8=D0=BA=D0=BE=D0=B2=29?= Date: Thu, 11 Jan 2024 06:40:38 -0500 Subject: [PATCH] [smooth] Remove SSE2. Benchmarking shows that rendering curves is faster without SSE2. This is understandable because we deal with 2D space and simple calculations. See !314 for testing results. * src/smooth/ftgrays.c (gray_render_conic): Remove SSE2 code. --- src/smooth/ftgrays.c | 104 +++---------------------------------------- 1 file changed, 6 insertions(+), 98 deletions(-) diff --git a/src/smooth/ftgrays.c b/src/smooth/ftgrays.c index 4574da8bc..79b864c1f 100644 --- a/src/smooth/ftgrays.c +++ b/src/smooth/ftgrays.c @@ -997,49 +997,12 @@ typedef ptrdiff_t FT_PtrDist; #endif /* - * Benchmarking shows that using DDA to flatten the quadratic Bézier arcs - * is slightly faster in the following cases: - * - * - When the host CPU is 64-bit. - * - When SSE2 SIMD registers and instructions are available (even on - * x86). - * - * For other cases, using binary splits is actually slightly faster. + * For now, the code that uses DDA to render conic curves requires + * `FT_Int64` to be defined. See for example + * https://gitlab.freedesktop.org/freetype/freetype/-/issues/1071. */ -#if ( defined( __SSE2__ ) || \ - defined( __x86_64__ ) || \ - defined( _M_AMD64 ) || \ - ( defined( _M_IX86_FP ) && _M_IX86_FP >= 2 ) ) && \ - !defined( __VMS ) -# define FT_SSE2 1 -#else -# define FT_SSE2 0 -#endif -#if FT_SSE2 || \ - defined( __aarch64__ ) || \ - defined( _M_ARM64 ) -# define BEZIER_USE_DDA 1 -#else -# define BEZIER_USE_DDA 0 -#endif - - /* - * For now, the code that depends on `BEZIER_USE_DDA` requires `FT_Int64` - * to be defined. If `FT_INT64` is not defined, meaning there is no - * 64-bit type available, disable it to avoid compilation errors. 
See for - * example https://gitlab.freedesktop.org/freetype/freetype/-/issues/1071. - */ -#if !defined( FT_INT64 ) -# undef BEZIER_USE_DDA -# define BEZIER_USE_DDA 0 -#endif - -#if BEZIER_USE_DDA - -#if FT_SSE2 -# include <emmintrin.h> -#endif +#ifdef FT_INT64 #define LEFT_SHIFT( a, b ) (FT_Int64)( (FT_UInt64)(a) << (b) ) @@ -1151,61 +1114,6 @@ typedef ptrdiff_t FT_PtrDist; * = (B << (33 - N)) + (A << (32 - 2*N)) */ -#if FT_SSE2 - /* Experience shows that for small counts, SSE2 is actually slower. */ - if ( count > 4 ) - { - union - { - struct { FT_Int64 ax, ay, bx, by; } i; - struct { __m128i a, b; } vec; - - } u; - - union - { - struct { FT_Int32 px_lo, px_hi, py_lo, py_hi; } i; - __m128i vec; - - } v; - - __m128i p, q, r; - - - u.i.ax = ax; - u.i.ay = ay; - u.i.bx = bx; - u.i.by = by; - - q = _mm_load_si128( &u.vec.b ); - r = _mm_load_si128( &u.vec.a ); - - q = _mm_slli_epi64( q, shift + 17); - r = _mm_slli_epi64( r, shift + shift ); - q = _mm_add_epi64( q, r ); - r = _mm_add_epi64( r, r ); - - v.i.px_lo = 0; - v.i.px_hi = p0.x; - v.i.py_lo = 0; - v.i.py_hi = p0.y; - - p = _mm_load_si128( &v.vec ); - - do - { - p = _mm_add_epi64( p, q ); - q = _mm_add_epi64( q, r ); - - _mm_store_si128( &v.vec, p ); - - gray_render_line( RAS_VAR_ v.i.px_hi, v.i.py_hi ); - } while ( --count ); - - return; - } -#endif /* FT_SSE2 */ - rx = LEFT_SHIFT( ax, shift + shift ); ry = LEFT_SHIFT( ay, shift + shift ); @@ -1230,7 +1138,7 @@ typedef ptrdiff_t FT_PtrDist; } while ( --count ); } -#else /* !BEZIER_USE_DDA */ +#else /* !FT_INT64 */ /* * Note that multiple attempts to speed up the function below @@ -1324,7 +1232,7 @@ typedef ptrdiff_t FT_PtrDist; } while ( --draw ); } -#endif /* !BEZIER_USE_DDA */ +#endif /* !FT_INT64 */ /*