From 56cc2ad446e920164e823e78a21972cadb339bfc Mon Sep 17 00:00:00 2001
From: David Turner
Date: Sat, 19 Jun 2021 10:26:53 +0200
Subject: [PATCH] [smooth] Implement Bezier quadratic arc flattening with DDA

Benchmarking shows that this provides a very slight performance boost
when rendering fonts with lots of quadratic Bezier arcs, compared to
the recursive arc splitting, but only when SSE2 is available, or on
64-bit CPUs.

On a 2017 Core i5-7300U CPU on Linux/x86_64:

  ./ftbench -p -s10 -t5 -cb .../DroidSansFallbackFull.ttf

    Before: 4.033 us/op  (best of 5 runs for all numbers)
    After:  3.876 us/op

  ./ftbench -p -s60 -t5 -cb .../DroidSansFallbackFull.ttf

    Before: 13.467 us/op
    After:  13.385 us/op
---
 ChangeLog            |  12 +++
 src/smooth/ftgrays.c | 191 ++++++++++++++++++++++++++++++++++++++++++-
 2 files changed, 202 insertions(+), 1 deletion(-)

diff --git a/ChangeLog b/ChangeLog
index ea269da30..a9d38aa7f 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,15 @@
+2021-07-15 David Turner
+
+    [smooth] Implement Bezier quadratic arc flattening with DDA
+
+    Benchmarking shows that this provides a very slight performance
+    boost when rendering fonts with lots of quadratic Bezier arcs,
+    compared to the recursive arc splitting, but only when SSE2 is
+    available, or on 64-bit CPUs.
+
+    * src/smooth/ftgrays.c (gray_render_conic): New implementation
+    based on DDA and optionally SSE2.
+
 2021-07-15 David Turner
 
     [smooth] Minor speedup to smooth rasterizer

diff --git a/src/smooth/ftgrays.c b/src/smooth/ftgrays.c
index e66ec34a7..7158cd21c 100644
--- a/src/smooth/ftgrays.c
+++ b/src/smooth/ftgrays.c
@@ -993,6 +993,188 @@ typedef ptrdiff_t FT_PtrDist;
 #endif
 
+/* Benchmarking shows that using DDA to flatten the quadratic Bezier
+ * arcs is slightly faster in the following cases:
+ *
+ * - When the host CPU is 64-bit.
+ * - When SSE2 SIMD registers and instructions are available (even on
+ *   x86).
+ *
+ * For other cases, using binary splits is actually slightly faster.
+ */
+#if defined(__SSE2__) || defined(__x86_64__) || defined(__aarch64__) || \
+    defined(_M_AMD64) || defined(_M_ARM64)
+#define BEZIER_USE_DDA  1
+#else
+#define BEZIER_USE_DDA  0
+#endif
+
+#if BEZIER_USE_DDA
+
+#include <emmintrin.h>
+
+  static void
+  gray_render_conic( RAS_ARG_ const FT_Vector*  control,
+                              const FT_Vector*  to )
+  {
+    FT_Vector  p0, p1, p2;
+
+    p0.x = ras.x;
+    p0.y = ras.y;
+    p1.x = UPSCALE( control->x );
+    p1.y = UPSCALE( control->y );
+    p2.x = UPSCALE( to->x );
+    p2.y = UPSCALE( to->y );
+
+    /* short-cut the arc that crosses the current band */
+    if ( ( TRUNC( p0.y ) >= ras.max_ey &&
+           TRUNC( p1.y ) >= ras.max_ey &&
+           TRUNC( p2.y ) >= ras.max_ey ) ||
+         ( TRUNC( p0.y ) < ras.min_ey &&
+           TRUNC( p1.y ) < ras.min_ey &&
+           TRUNC( p2.y ) < ras.min_ey ) )
+    {
+      ras.x = p2.x;
+      ras.y = p2.y;
+      return;
+    }
+
+    TPos  dx = FT_ABS( p0.x + p2.x - 2 * p1.x );
+    TPos  dy = FT_ABS( p0.y + p2.y - 2 * p1.y );
+    if ( dx < dy )
+      dx = dy;
+
+    if ( dx <= ONE_PIXEL / 4 )
+    {
+      gray_render_line( RAS_VAR_ p2.x, p2.y );
+      return;
+    }
+
+    /* We can calculate the number of necessary bisections because  */
+    /* each bisection predictably reduces deviation exactly 4-fold. */
+    /* Even 32-bit deviation would vanish after 16 bisections.      */
+    int  shift = 0;
+    do
+    {
+      dx >>= 2;
+      shift += 1;
+    }
+    while ( dx > ONE_PIXEL / 4 );
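+
+    /* Illustrative example (assuming the default PIXEL_BITS == 8,  */
+    /* i.e. ONE_PIXEL == 256): a deviation of dx == 3000 shrinks as */
+    /* 3000 -> 750 -> 187 -> 46, giving shift == 3, so the arc is   */
+    /* flattened into 2^3 == 8 line segments.                       */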
+
+    /*
+     * The (P0,P1,P2) arc equation, for t in the [0,1] range:
+     *
+     *   P(t) = P0*(1-t)^2 + P1*2*t*(1-t) + P2*t^2
+     *
+     *   P(t) = P0 + 2*(P1-P0)*t + (P0+P2-2*P1)*t^2
+     *        = P0 + 2*B*t + A*t^2
+     *
+     * for A = P0 + P2 - 2*P1
+     * and B = P1 - P0
+     *
+     * Let's consider the difference when advancing by a small
+     * parameter h:
+     *
+     *   Q(h,t) = P(t+h) - P(t) = 2*B*h + A*h^2 + 2*A*h*t
+     *
+     * And then its own difference:
+     *
+     *   R(h,t) = Q(h,t+h) - Q(h,t) = 2*A*h*h = R (constant)
+     *
+     * Since R is always a constant, it is possible to compute
+     * successive positions with:
+     *
+     *   P = P0
+     *   Q = Q(h,0) = 2*B*h + A*h*h
+     *   R = 2*A*h*h
+     *
+     *   loop:
+     *     P += Q
+     *     Q += R
+     *     EMIT(P)
+     *
+     * To ensure accurate results, perform computations on 64-bit
+     * values, after scaling them by 2^32.  With h = 1/2^N, where
+     * N is the `shift` computed above:
+     *
+     *   R << 32 = 2 * A << (32 - N - N)
+     *           = A << (33 - 2 * N)
+     *
+     *   Q << 32 = (2 * B << (32 - N)) + (A << (32 - N - N))
+     *           = (B << (33 - N)) + (A << (32 - N - N))
+     */
+#ifdef __SSE2__
+    /* Experience shows that for small shift values, */
+    /* SSE2 is actually slower.                      */
+    if ( shift > 2 )
+    {
+      union
+      {
+        struct { FT_Int64  ax, ay, bx, by; }  i;
+        struct { __m128i  a, b; }             vec;
+      } u;
+
+      u.i.ax = p0.x + p2.x - 2 * p1.x;
+      u.i.ay = p0.y + p2.y - 2 * p1.y;
+      u.i.bx = p1.x - p0.x;
+      u.i.by = p1.y - p0.y;
+
+      __m128i  a = _mm_load_si128( &u.vec.a );
+      __m128i  b = _mm_load_si128( &u.vec.b );
+
+      __m128i  r  = _mm_slli_epi64( a, 33 - 2 * shift );
+      __m128i  q  = _mm_slli_epi64( b, 33 - shift );
+      __m128i  q2 = _mm_slli_epi64( a, 32 - 2 * shift );
+      q = _mm_add_epi64( q2, q );
+
+      /* P is kept in the upper 32 bits of two 64-bit lanes; */
+      /* this layout assumes a little-endian CPU, which is   */
+      /* always the case when SSE2 is available.             */
+      union
+      {
+        struct { FT_Int32  px_lo, px_hi, py_lo, py_hi; }  i;
+        __m128i  vec;
+      } v;
+      v.i.px_lo = 0;
+      v.i.px_hi = p0.x;
+      v.i.py_lo = 0;
+      v.i.py_hi = p0.y;
+
+      __m128i  p = _mm_load_si128( &v.vec );
+
+      for ( unsigned count = ( 1u << shift ); count > 0; count-- )
+      {
+        p = _mm_add_epi64( p, q );
+        q = _mm_add_epi64( q, r );
+
+        _mm_store_si128( &v.vec, p );
+
+        gray_render_line( RAS_VAR_ v.i.px_hi, v.i.py_hi );
+      }
+      return;
+    }
+#endif /* __SSE2__ */
+
+    FT_Int64  ax = p0.x + p2.x - 2 * p1.x;
+    FT_Int64  ay = p0.y + p2.y - 2 * p1.y;
+    FT_Int64  bx = p1.x - p0.x;
+    FT_Int64  by = p1.y - p0.y;
+
+    FT_Int64  rx = ax << ( 33 - 2 * shift );
+    FT_Int64  ry = ay << ( 33 - 2 * shift );
+
+    FT_Int64  qx = ( bx << ( 33 - shift ) ) + ( ax << ( 32 - 2 * shift ) );
+    FT_Int64  qy = ( by << ( 33 - shift ) ) + ( ay << ( 32 - 2 * shift ) );
+
+    FT_Int64  px = (FT_Int64)p0.x << 32;
+    FT_Int64  py = (FT_Int64)p0.y << 32;
+
+    FT_UInt  count = 1u << shift;
+
+    for ( ; count > 0; count-- )
+    {
+      px += qx;
+      py += qy;
+      qx += rx;
+      qy += ry;
+
+      gray_render_line( RAS_VAR_ (FT_Pos)( px >> 32 ), (FT_Pos)( py >> 32 ) );
+    }
+  }
+
+#else /* !BEZIER_USE_DDA */
+
+  /* Note that multiple attempts to speed up the function below
+   * with SSE2 intrinsics, using various data layouts, have turned
+   * out to be slower than the non-SIMD code below.
+   */
   static void
   gray_split_conic( FT_Vector*  base )
   {
@@ -1078,7 +1260,15 @@ typedef ptrdiff_t FT_PtrDist;
     } while ( --draw );
   }
 
+#endif /* !BEZIER_USE_DDA */
+
+  /* For cubic Bezier arcs, binary splits are still faster than DDA
+   * because the splits are adaptive to how quickly each sub-arc
+   * approaches its chord trisection points.
+   *
+   * It might be useful to experiment with SSE2 to speed up
+   * gray_split_cubic() though.
+   */
   static void
   gray_split_cubic( FT_Vector*  base )
   {
@@ -1169,7 +1359,6 @@ typedef ptrdiff_t FT_PtrDist;
     }
   }
 
-
   static int
   gray_move_to( const FT_Vector*  to,
                 gray_PWorker      worker )
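
For reference, here is a small self-contained program that exercises the same
forward-differencing arithmetic outside the rasterizer.  This is only an
illustrative sketch, not part of the patch: the names `Point`, `emit` and
`flatten_conic_dda` are made up, coordinates are plain 24.8 fixed-point values
(the UPSCALE'd format with the default PIXEL_BITS == 8), and `shift` is passed
in directly instead of being derived from the deviation loop.

#include <stdint.h>
#include <stdio.h>

/* a point in 24.8 fixed-point coordinates */
typedef struct { int32_t x, y; } Point;

static void
emit( Point p )
{
  printf( "(%g, %g)\n", p.x / 256.0, p.y / 256.0 );
}

/* Flatten the quadratic arc (p0,p1,p2) into 2^shift line segments  */
/* with the constant-second-difference scheme described above.      */
static void
flatten_conic_dda( Point p0, Point p1, Point p2, int shift )
{
  /* A = P0 + P2 - 2*P1 and B = P1 - P0 */
  int64_t ax = (int64_t)p0.x + p2.x - 2 * p1.x;
  int64_t ay = (int64_t)p0.y + p2.y - 2 * p1.y;
  int64_t bx = (int64_t)p1.x - p0.x;
  int64_t by = (int64_t)p1.y - p0.y;

  /* with h = 1/2^shift: R = 2*A*h^2 and Q = 2*B*h + A*h^2,     */
  /* both pre-scaled by 2^32 (signed shifts, as in the patch)   */
  int64_t rx = ax << ( 33 - 2 * shift );
  int64_t ry = ay << ( 33 - 2 * shift );
  int64_t qx = ( bx << ( 33 - shift ) ) + ( ax << ( 32 - 2 * shift ) );
  int64_t qy = ( by << ( 33 - shift ) ) + ( ay << ( 32 - 2 * shift ) );

  int64_t px = (int64_t)p0.x << 32;
  int64_t py = (int64_t)p0.y << 32;

  for ( unsigned count = 1u << shift; count > 0; count-- )
  {
    px += qx;
    py += qy;
    qx += rx;
    qy += ry;

    emit( (Point){ (int32_t)( px >> 32 ), (int32_t)( py >> 32 ) } );
  }
}

int
main( void )
{
  /* a 4x1 pixel parabolic bump: P0=(0,0), P1=(2,2), P2=(4,0) */
  Point p0 = { 0 * 256, 0 * 256 };
  Point p1 = { 2 * 256, 2 * 256 };
  Point p2 = { 4 * 256, 0 * 256 };

  flatten_conic_dda( p0, p1, p2, 2 );  /* 2^2 == 4 segments */
  return 0;
}

For this arc the deviation loop in the patch would also select shift == 2,
and the program prints (1, 0.75), (2, 1), (3, 0.75), (4, 0), which matches
direct evaluation of P(t) at t = 1/4, 2/4, 3/4 and 4/4.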