diff --git a/ChangeLog b/ChangeLog index a9d38aa7f..b7daa70c2 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,38 +1,88 @@ +2021-07-15 Ben Wagner + + * src/smooth/ftgrays.c: Guard inclusion of `emmintrin.h`. + + Guard inclusion of `emmintrin.h` with `#ifdef __SSE2__`. The gcc + version of this header, `xmmintrin.h`, and `mmintrin.h` check that + the appropriate defines are set before defining anything (are + internally guarded). However, the clang versions of these includes + are not internally guarded. As a result of this, externally guard + the inclusion of these headers. + 2021-07-15 David Turner - [smooth] Implement Bezier quadratic arc flattenning with DDA + [smooth] Implement Bézier quadratic arc flattening with DDA. Benchmarking shows that this provides a very slighty performance - boost when rendering fonts with lots of quadratic bezier arcs, + boost when rendering fonts with lots of quadratic Bézier arcs, compared to the recursive arc splitting, but only when SSE2 is available, or on 64-bit CPUs. + On a 2017 Core i5-7300U CPU on Linux/x86_64: + + ftbench -p -s10 -t5 -cb DroidSansFallbackFull.ttf + + Before: 4.033 us/op (best of 5 runs for all numbers) + After: 3.876 us/op + + ftbench -p -s60 -t5 -cb DroidSansFallbackFull.ttf + + Before: 13.467 us/op + After: 13.385 us/op + * src/smooth/ftgrays.c (gray_render_conic): New implementation based on DDA and optionally SSE2. 2021-07-15 David Turner - [smooth] Minor speedup to smooth rasterizer + [smooth] Minor speedup to smooth rasterizer. - This speeds up the smooth rasterizer by avoiding a conditional + This speeds up the smooth rasterizer by avoiding conditional branches in the hot path. - * src/smooth/ftgrays.c: Define a null cell used to both as a - sentinel for all linked-lists, and to accumulate coverage and - area values for "out-of-bounds" cell positions without a - conditional check. + - Define a fixed 'null cell', which will be pointed to whenever the + current cell is outside of the current target region. 
This avoids + a `ras.cell != NULL` check in the `FT_INTEGRATE` macro. + + - Also use the null cell as a sentinel at the end of all `ycells` + linked-lists, by setting its x coordinate to `INT_MAX`. This + avoids an `if (!cell)` check in `gray_set_cell` as well. + + - Slightly change the worker struct fields to perform somewhat fewer + operations during rendering. + + Example results (on a 2013 Core i5-3337U CPU) + + out/ftbench -p -s10 -t5 -bc DroidSansFallbackFull.ttf + + Before: 5.472 us/op + After: 5.275 us/op + + out/ftbench -p -s60 -t5 -bc DroidSansFallbackFull.ttf + + Before: 17.988 us/op + After: 17.389 us/op + + * src/smooth/ftgrays.c (gray_TWorker): Replace `num_cells` field with + `cell_free` and `cell_limit`. + (NULL_CELL_PTR, CELL_MAX_X_VALUE, CELL_IS_NULL): New macros. + (gray_dump_cells, gray_set_cell, gray_sweep, gray_sweep_direct, + gray_convert_glyph_inner, gray_convert_glyph): Updated. 2021-07-15 David Turner - Replaces download-test-fonts.sh with download-test-fonts.py which - does the same work, and also avoids downloading anything if the - files are already installed with the right content. + [tests] Rewrite download script in Python3. - Now uses the first 8 byte of each file's sha256 hash for the digest. + This commit replaces the bash script with a Python script that does + the same work, and also avoids downloading anything if the files are + already installed with the right content. - * tests/scripts/download-test-fonts.sh: Removed - * tests/scripts/download-test-fonts.py: New script - * tests/README.md: Updated + We now use the first 8 bytes of each file's sha256 hash for the + digest. + + * tests/scripts/download-test-fonts.sh: Removed. + * tests/scripts/download-test-fonts.py: New script. + * tests/README.md: Updated. 
2021-07-15 Alex Richardson diff --git a/src/smooth/ftgrays.c b/src/smooth/ftgrays.c index 5e04ff41b..b802030e7 100644 --- a/src/smooth/ftgrays.c +++ b/src/smooth/ftgrays.c @@ -487,8 +487,8 @@ typedef ptrdiff_t FT_PtrDist; PCell cell_free; /* call allocation next free slot */ PCell cell_limit; /* cell allocation limit */ - PCell* ycells; /* array of cell linked-lists, one per */ - /* vertical coordinate in the current band. */ + PCell* ycells; /* array of cell linked-lists; one per */ + /* vertical coordinate in the current band */ PCell cells; /* cell storage area */ FT_PtrDist max_cells; /* cell storage capacity */ @@ -513,19 +513,21 @@ typedef ptrdiff_t FT_PtrDist; static gray_TWorker ras; #endif -/* Return a pointer to the "null cell", used as a sentinel at the end */ -/* of all ycells[] linked lists. Its x coordinate should be maximal */ -/* to ensure no NULL checks are necessary when looking for an insertion */ -/* point in gray_set_cell(). Other loops should check the cell pointer */ -/* with CELL_IS_NULL() to detect the end of the list. */ -#define NULL_CELL_PTR(ras) (ras).cells + /* + * Return a pointer to the 'null cell', used as a sentinel at the end of + * all `ycells` linked lists. Its x coordinate should be maximal to + * ensure no NULL checks are necessary when looking for an insertion point + * in `gray_set_cell`. Other loops should check the cell pointer with + * CELL_IS_NULL() to detect the end of the list. + */ +#define NULL_CELL_PTR( ras ) (ras).cells -/* The |x| value of the null cell. Must be the largest possible */ -/* integer value stored in a TCell.x field. */ + /* The |x| value of the null cell. Must be the largest possible */ + /* integer value stored in a `TCell.x` field. */ #define CELL_MAX_X_VALUE INT_MAX -/* Return true iff |cell| points to the null cell. */ -#define CELL_IS_NULL(cell) ((cell)->x == CELL_MAX_X_VALUE) + /* Return true iff |cell| points to the null cell. 
*/ +#define CELL_IS_NULL( cell ) ( (cell)->x == CELL_MAX_X_VALUE ) #define FT_INTEGRATE( ras, a, b ) \ @@ -556,7 +558,7 @@ typedef ptrdiff_t FT_PtrDist; printf( "%3d:", y ); - for ( ; !CELL_IS_NULL(cell); cell = cell->next ) + for ( ; !CELL_IS_NULL( cell ); cell = cell->next ) printf( " (%3d, c:%4d, a:%6d)", cell->x, cell->cover, cell->area ); printf( "\n" ); @@ -584,9 +586,11 @@ typedef ptrdiff_t FT_PtrDist; /* Note that if a cell is to the left of the clipping region, it is */ /* actually set to the (min_ex-1) horizontal position. */ - TCoord ey_index = ey - ras.min_ey; + TCoord ey_index = ey - ras.min_ey; + + if ( ey_index < 0 || ey_index >= ras.count_ey || ex >= ras.max_ex ) - ras.cell = NULL_CELL_PTR(ras); + ras.cell = NULL_CELL_PTR( ras ); else { PCell* pcell = ras.ycells + ey_index; @@ -610,7 +614,7 @@ typedef ptrdiff_t FT_PtrDist; /* insert new cell */ cell = ras.cell_free++; - if (cell >= ras.cell_limit) + if ( cell >= ras.cell_limit ) ft_longjmp( ras.jump_buffer, 1 ); cell->x = ex; @@ -978,6 +982,7 @@ typedef ptrdiff_t FT_PtrDist; } gray_set_cell( RAS_VAR_ ex1, ey1 ); + } while ( ex1 != ex2 || ey1 != ey2 ); } @@ -987,30 +992,37 @@ typedef ptrdiff_t FT_PtrDist; FT_INTEGRATE( ras, fy2 - fy1, fx1 + fx2 ); End: - ras.x = to_x; - ras.y = to_y; + ras.x = to_x; + ras.y = to_y; } #endif -/* Benchmarking shows that using DDA to flatten the quadratic bezier - * arcs is slightly faster in the following cases: - * - * - When the host CPU is 64-bit. - * - When SSE2 SIMD registers and instructions are available (even on x86). - * - * For other cases, using binary splits is actually slightly faster. - */ -#if defined(__SSE2__) || defined(__x86_64__) || defined(__aarch64__) || defined(_M_AMD64) || defined(_M_ARM64) -#define BEZIER_USE_DDA 1 + /* + * Benchmarking shows that using DDA to flatten the quadratic Bézier arcs + * is slightly faster in the following cases: + * + * - When the host CPU is 64-bit. 
+ * - When SSE2 SIMD registers and instructions are available (even on + * x86). + * + * For other cases, using binary splits is actually slightly faster. + */ +#if defined( __SSE2__ ) || \ + defined( __x86_64__ ) || \ + defined( __aarch64__ ) || \ + defined( _M_AMD64 ) || \ + defined( _M_ARM64 ) +# define BEZIER_USE_DDA 1 #else -#define BEZIER_USE_DDA 0 +# define BEZIER_USE_DDA 0 #endif + #if BEZIER_USE_DDA #ifdef __SSE2__ -#include +# include #endif static void @@ -1058,8 +1070,8 @@ typedef ptrdiff_t FT_PtrDist; { dx >>= 2; shift += 1; - } - while (dx > ONE_PIXEL / 4); + + } while ( dx > ONE_PIXEL / 4 ); /* * The (P0,P1,P2) arc equation, for t in [0,1] range: @@ -1102,12 +1114,17 @@ typedef ptrdiff_t FT_PtrDist; * Q << 32 = (2 * B << (32 - N)) + (A << (32 - N - N)) * = (B << (33 - N)) + (A << (32 - N - N)) */ + #ifdef __SSE2__ - /* Experience shows that for small shift values, SSE2 is actually slower. */ - if (shift > 2) { - union { - struct { FT_Int64 ax, ay, bx, by; } i; - struct { __m128i a, b; } vec; + /* Experience shows that for small shift values, */ + /* SSE2 is actually slower. 
*/ + if ( shift > 2 ) + { + union + { + struct { FT_Int64 ax, ay, bx, by; } i; + struct { __m128i a, b; } vec; + } u; u.i.ax = p0.x + p2.x - 2 * p1.x; @@ -1138,10 +1155,11 @@ typedef ptrdiff_t FT_PtrDist; p = _mm_add_epi64(p, q); q = _mm_add_epi64(q, r); - _mm_store_si128(&v.vec, p); + _mm_store_si128( &v.vec, p ); - gray_render_line( RAS_VAR_ v.i.px_hi, v.i.py_hi); + gray_render_line( RAS_VAR_ v.i.px_hi, v.i.py_hi ); } + return; } #endif /* !__SSE2__ */ @@ -1167,13 +1185,15 @@ typedef ptrdiff_t FT_PtrDist; qx += rx; qy += ry; - gray_render_line( RAS_VAR_ (FT_Pos)(px >> 32), (FT_Pos)(py >> 32)); + gray_render_line( RAS_VAR_ (FT_Pos)( px >> 32 ), + (FT_Pos)( py >> 32 ) ); } } #else /* !BEZIER_USE_DDA */ - /* Note that multiple attempts to speed up the function below + /* + * Note that multiple attempts to speed up the function below * with SSE2 intrinsics, using various data layouts, have turned * out to be slower than the non-SIMD code below. */ @@ -1264,12 +1284,14 @@ typedef ptrdiff_t FT_PtrDist; #endif /* !BEZIER_USE_DDA */ - /* For cubic bezier, binary splits are still faster than DDA + + /* + * For cubic Bézier, binary splits are still faster than DDA * because the splits are adaptive to how quickly each sub-arc * approaches their chord trisection points. * * It might be useful to experiment with SSE2 to speed up - * gray_split_cubic() though. + * `gray_split_cubic`, though. 
*/ static void gray_split_cubic( FT_Vector* base ) @@ -1361,6 +1383,7 @@ typedef ptrdiff_t FT_PtrDist; } } + static int gray_move_to( const FT_Vector* to, gray_PWorker worker ) @@ -1428,7 +1451,7 @@ typedef ptrdiff_t FT_PtrDist; unsigned char* line = ras.target.origin - ras.target.pitch * y; - for ( ; !CELL_IS_NULL(cell); cell = cell->next ) + for ( ; !CELL_IS_NULL( cell ); cell = cell->next ) { if ( cover != 0 && cell->x > x ) { @@ -1476,7 +1499,7 @@ typedef ptrdiff_t FT_PtrDist; TArea area; - for ( ; !CELL_IS_NULL(cell); cell = cell->next ) + for ( ; !CELL_IS_NULL( cell ); cell = cell->next ) { if ( cover != 0 && cell->x > x ) { @@ -1898,19 +1921,19 @@ typedef ptrdiff_t FT_PtrDist; /* memory management */ n = ( height * sizeof ( PCell ) + sizeof ( TCell ) - 1 ) / sizeof ( TCell ); - ras.cells = buffer + n; - ras.max_cells = (FT_PtrDist)( FT_MAX_GRAY_POOL - n ); + ras.cells = buffer + n; + ras.max_cells = (FT_PtrDist)( FT_MAX_GRAY_POOL - n ); ras.cell_limit = ras.cells + ras.max_cells; - ras.ycells = (PCell*)buffer; + ras.ycells = (PCell*)buffer; - /* Initialize the null cell is at the start of the 'cells' array. */ - /* Note that this requires ras.cell_free initialization to skip */ - /* over the first entry in the array. */ - PCell null_cell = NULL_CELL_PTR(ras); - null_cell->x = CELL_MAX_X_VALUE; - null_cell->area = 0; - null_cell->cover = 0; - null_cell->next = NULL;; + /* Initialize the null cell at the start of the `cells` array. */ + /* Note that this requires `ras.cell_free` initialization to skip */ + /* over the first entry in the array. */ + PCell null_cell = NULL_CELL_PTR( ras ); + null_cell->x = CELL_MAX_X_VALUE; + null_cell->area = 0; + null_cell->cover = 0; + null_cell->next = NULL;; for ( y = yMin; y < yMax; ) { @@ -1928,7 +1951,8 @@ typedef ptrdiff_t FT_PtrDist; TCoord w; int error; - for (w = 0; w < width; ++w) + + for ( w = 0; w < width; ++w ) ras.ycells[w] = null_cell; ras.cell_free = ras.cells + 1; /* NOTE: Skip over the null cell. 
*/ diff --git a/tests/scripts/download-test-fonts.py b/tests/scripts/download-test-fonts.py index cab133daf..52b742e22 100755 --- a/tests/scripts/download-test-fonts.py +++ b/tests/scripts/download-test-fonts.py @@ -1,8 +1,7 @@ #!/usr/bin/env python3 -"""Download test fonts used by the FreeType regression test programs. -These will be copied to $FREETYPE/tests/data/ by default. -""" +"""Download test fonts used by the FreeType regression test programs. These +will be copied to $FREETYPE/tests/data/ by default.""" import argparse import collections @@ -15,8 +14,8 @@ import zipfile from typing import Callable, List, Optional, Tuple -# The list of download items describing the font files to install. -# Each download item is a dictionary with one of the following schemas: +# The list of download items describing the font files to install. Each +# download item is a dictionary with one of the following schemas: # # - File item: # @@ -28,8 +27,8 @@ from typing import Callable, List, Optional, Tuple # install_name # Type: file name string # Required: No -# Description: Installation name for the font file, only provided if it -# must be different from the original URL's basename. +# Description: Installation name for the font file, only provided if +# it must be different from the original URL's basename. # # hex_digest # Type: hexadecimal string @@ -39,7 +38,7 @@ from typing import Callable, List, Optional, Tuple # - Zip items: # # These items correspond to one or more font files that are embedded in a -# remote zip archive. Each entry has the following fields: +# remote zip archive. Each entry has the following fields: # # zip_url # Type: URL string. 
@@ -52,23 +51,25 @@ from typing import Callable, List, Optional, Tuple # Description: A list of entries describing a single font file to be # extracted from the archive # -# Apart from that, some schemas are used for dictionaries used inside download -# items: +# Apart from that, some schemas are used for dictionaries used inside +# download items: # # - File entries: # -# These are dictionaries describing a single font file to extract from an archive. +# These are dictionaries describing a single font file to extract from an +# archive. # # filename # Type: file path string # Required: Yes -# Description: Path of source file, relative to the archive's top-level directory. +# Description: Path of source file, relative to the archive's +# top-level directory. # # install_name # Type: file name string # Required: No -# Description: Installation name for the font file, only provided if it must be -# different from the original filename value. +# Description: Installation name for the font file; only provided if +# it must be different from the original filename value. # # hex_digest # Type: hexadecimal string @@ -90,7 +91,8 @@ _DOWNLOAD_ITEMS = [ def digest_data(data: bytes): - """Compute the digest of a given input byte string, which are the first 8 bytes of its sha256 hash.""" + """Compute the digest of a given input byte string, which are the first + 8 bytes of its sha256 hash.""" m = hashlib.sha256() m.update(data) return m.digest()[:8] @@ -155,14 +157,16 @@ def extract_file_from_zip_archive( Args: archive: Input ZipFile objec. - archive_name: Archive name or URL, only used to generate a human-readable error - message. + archive_name: Archive name or URL, only used to generate a + human-readable error message. + filepath: Input filepath in archive. expected_digest: Optional digest for the file. Returns: A new File instance corresponding to the extract file. Raises: - ValueError if expected_digest is not None and does not match the extracted file. 
+ ValueError if expected_digest is not None and does not match the + extracted file. """ file = archive.open(filepath) if expected_digest is not None: @@ -181,7 +185,8 @@ def _get_and_install_file( force_download: bool, get_content: Callable[[], bytes], ) -> bool: - if not force_download and hex_digest is not None and os.path.exists(install_path): + if not force_download and hex_digest is not None \ + and os.path.exists(install_path): with open(install_path, "rb") as f: content: bytes = f.read() if bytes.fromhex(hex_digest) == digest_data(content): @@ -200,14 +205,15 @@ def download_and_install_item( Args: item: Download item as a dictionary, see above for schema. install_dir: Installation directory. - force_download: Set to True to force download and installation, even if - the font file is already installed with the right content. + force_download: Set to True to force download and installation, even + if the font file is already installed with the right content. Returns: - A list of (install_name, status) tuples, where 'install_name' is the file's - installation name under 'install_dir', and 'status' is a boolean that is True - to indicate that the file was downloaded and installed, or False to indicate that - the file is already installed with the right content. + A list of (install_name, status) tuples, where 'install_name' is the + file's installation name under 'install_dir', and 'status' is a + boolean that is True to indicate that the file was downloaded and + installed, or False to indicate that the file is already installed + with the right content. """ if "file_url" in item: file_url = item["file_url"] @@ -284,10 +290,13 @@ def main(): for install_name, status in download_and_install_item( item, args.install_dir, args.force ): - print("%s %s" % (install_name, "INSTALLED" if status else "UP-TO-DATE")) + print("%s %s" % (install_name, + "INSTALLED" if status else "UP-TO-DATE")) return 0 if __name__ == "__main__": sys.exit(main()) + +# EOF