From c381aaa3d1e2dc68b83d0c6d390eb65abf9e9ae5 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Alexei=20Podtelezhnikov=20=28=D0=90=D0=BB=D0=B5=D0=BA?=
 =?UTF-8?q?=D1=81=D0=B5=D0=B9=20=D0=9F=D0=BE=D0=B4=D1=82=D0=B5=D0=BB=D0=B5?=
 =?UTF-8?q?=D0=B6=D0=BD=D0=B8=D0=BA=D0=BE=D0=B2=29?= <apodtele@gmail.com>
Date: Thu, 11 Jan 2024 06:40:38 -0500
Subject: [PATCH] [smooth] Remove SSE2.

Benchmarking shows that rendering curves is faster without SSE2. This is
understandable because we deal with 2D space and simple calculations.
See !314 for testing results.

* src/smooth/ftgrays.c (gray_render_conic): Remove SSE2 code.
(FT_SSE2, BEZIER_USE_DDA): Remove macros; test `FT_INT64` directly.
---
 src/smooth/ftgrays.c | 104 +++----------------------------------------
 1 file changed, 6 insertions(+), 98 deletions(-)

diff --git a/src/smooth/ftgrays.c b/src/smooth/ftgrays.c
index 4574da8bc..79b864c1f 100644
--- a/src/smooth/ftgrays.c
+++ b/src/smooth/ftgrays.c
@@ -997,49 +997,12 @@ typedef ptrdiff_t  FT_PtrDist;
 #endif
 
   /*
-   * Benchmarking shows that using DDA to flatten the quadratic Bézier arcs
-   * is slightly faster in the following cases:
-   *
-   *   - When the host CPU is 64-bit.
-   *   - When SSE2 SIMD registers and instructions are available (even on
-   *     x86).
-   *
-   * For other cases, using binary splits is actually slightly faster.
+   * The DDA code for conic curves needs the 64-bit type `FT_Int64`, which
+   * exists only if the macro `FT_INT64` is defined.  See for example
+   *    https://gitlab.freedesktop.org/freetype/freetype/-/issues/1071.
    */
-#if ( defined( __SSE2__ )                          ||   \
-      defined( __x86_64__ )                        ||   \
-      defined( _M_AMD64 )                          ||   \
-      ( defined( _M_IX86_FP ) && _M_IX86_FP >= 2 ) ) && \
-    !defined( __VMS )
-#  define FT_SSE2 1
-#else
-#  define FT_SSE2 0
-#endif
 
-#if FT_SSE2                || \
-    defined( __aarch64__ ) || \
-    defined( _M_ARM64 )
-#  define BEZIER_USE_DDA  1
-#else
-#  define BEZIER_USE_DDA  0
-#endif
-
-  /*
-   * For now, the code that depends on `BEZIER_USE_DDA` requires `FT_Int64`
-   * to be defined.  If `FT_INT64` is not defined, meaning there is no
-   * 64-bit type available, disable it to avoid compilation errors.  See for
-   * example https://gitlab.freedesktop.org/freetype/freetype/-/issues/1071.
-   */
-#if !defined( FT_INT64 )
-#  undef BEZIER_USE_DDA
-#  define BEZIER_USE_DDA  0
-#endif
-
-#if BEZIER_USE_DDA
-
-#if FT_SSE2
-#  include <emmintrin.h>
-#endif
+#ifdef FT_INT64
 
 #define LEFT_SHIFT( a, b )  (FT_Int64)( (FT_UInt64)(a) << (b) )
 
@@ -1151,61 +1114,6 @@ typedef ptrdiff_t  FT_PtrDist;
      *             = (B << (33 - N)) + (A << (32 - 2*N))
      */
 
-#if FT_SSE2
-    /* Experience shows that for small counts, SSE2 is actually slower. */
-    if ( count > 4 )
-    {
-      union
-      {
-        struct { FT_Int64  ax, ay, bx, by; }  i;
-        struct { __m128i  a, b; }  vec;
-
-      } u;
-
-      union
-      {
-        struct { FT_Int32  px_lo, px_hi, py_lo, py_hi; }  i;
-        __m128i  vec;
-
-      } v;
-
-      __m128i  p, q, r;
-
-
-      u.i.ax = ax;
-      u.i.ay = ay;
-      u.i.bx = bx;
-      u.i.by = by;
-
-      q = _mm_load_si128( &u.vec.b );
-      r = _mm_load_si128( &u.vec.a );
-
-      q = _mm_slli_epi64( q, shift + 17);
-      r = _mm_slli_epi64( r, shift + shift );
-      q = _mm_add_epi64( q, r );
-      r = _mm_add_epi64( r, r );
-
-      v.i.px_lo = 0;
-      v.i.px_hi = p0.x;
-      v.i.py_lo = 0;
-      v.i.py_hi = p0.y;
-
-      p = _mm_load_si128( &v.vec );
-
-      do
-      {
-        p = _mm_add_epi64( p, q );
-        q = _mm_add_epi64( q, r );
-
-        _mm_store_si128( &v.vec, p );
-
-        gray_render_line( RAS_VAR_ v.i.px_hi, v.i.py_hi );
-      } while ( --count );
-
-      return;
-    }
-#endif  /* FT_SSE2 */
-
     rx = LEFT_SHIFT( ax, shift + shift );
     ry = LEFT_SHIFT( ay, shift + shift );
 
@@ -1230,7 +1138,7 @@ typedef ptrdiff_t  FT_PtrDist;
     } while ( --count );
   }
 
-#else  /* !BEZIER_USE_DDA */
+#else  /* !FT_INT64 */
 
   /*
    * Note that multiple attempts to speed up the function below
@@ -1324,7 +1232,7 @@ typedef ptrdiff_t  FT_PtrDist;
     } while ( --draw );
   }
 
-#endif  /* !BEZIER_USE_DDA */
+#endif  /* !FT_INT64 */
 
 
   /*