From c9d7c03fa1b5c6244759ade814c546da04080646 Mon Sep 17 00:00:00 2001 From: Behdad Esfahbod Date: Tue, 15 Aug 2017 08:48:17 +0200 Subject: [PATCH] [sfnt] Speed up PNG image loading. This reduces the overhead of `premultiply_data' by 60%. * src/sfnt/pngshim.c (premultiply_data): Provide code which uses gcc's (and clang's) `vector_byte' attribute to process 4 pixels at a time. --- ChangeLog | 10 +++++++++ src/sfnt/pngshim.c | 53 +++++++++++++++++++++++++++++++++++++++++++--- 2 files changed, 60 insertions(+), 3 deletions(-) diff --git a/ChangeLog b/ChangeLog index 4091498f9..06ea99e9e 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,13 @@ +2017-08-15 Behdad Esfahbod + + [sfnt] Speed up PNG image loading. + + This reduces the overhead of `premultiply_data' by 60%. + + * src/sfnt/pngshim.c (premultiply_data): Provide code which uses + gcc's (and clang's) `vector_byte' attribute to process 4 pixels at a + time. + 2017-08-11 Werner Lemberg [sfnt, truetype] Improve handling of missing sbits. diff --git a/src/sfnt/pngshim.c b/src/sfnt/pngshim.c index b9b296ea5..90a56699f 100644 --- a/src/sfnt/pngshim.c +++ b/src/sfnt/pngshim.c @@ -49,18 +49,65 @@ } - /* Premultiplies data and converts RGBA bytes => native endian. */ + /* Premultiplies data and converts RGBA bytes => BGRA. */ static void premultiply_data( png_structp png, png_row_infop row_info, png_bytep data ) { - unsigned int i; + unsigned int i = 0, limit; FT_UNUSED( png ); + /* the `vector_size' attribute was introduced in gcc 3.1, which */ + /* predates clang; the `__BYTE_ORDER__' preprocessor symbol was */ + /* introduced in gcc 4.6 and clang 3.2, respectively */ +#if ( ( defined( __GNUC__ ) && \ + ( ( __GNUC__ >= 5 ) || \ + ( ( __GNUC__ == 4 ) && ( __GNUC_MINOR__ >= 6 ) ) ) ) || \ + ( defined( __clang__ ) && \ + ( ( __clang_major__ >= 4 ) || \ + ( ( __clang_major__ == 3 ) && ( __clang_minor__ >= 2 ) ) ) ) ) && \ + defined( __OPTIMIZE__ ) && \ + __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ - for ( i = 0; i < row_info->rowbytes; i += 4 ) + typedef unsigned short v82 __attribute__(( vector_size( 16 ) )); + + + /* process blocks of 16 bytes in one rush, which gives a nice speed-up */ + limit = row_info->rowbytes - 16 + 1; + for ( ; i < limit; i += 16 ) + { + char* base = &data[i]; + + v82 s, s0, s1, a; + v82 ma = { 1, 1, 3, 3, 5, 5, 7, 7 }; + v82 o1 = { 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF }; + v82 m0 = { 1, 0, 3, 2, 5, 4, 7, 6 }; + + + memcpy( &s, base, 16 ); /* RGBA RGBA RGBA RGBA */ + s0 = s & 0xFF; /* R B R B R B R B */ + s1 = s >> 8; /* G A G A G A G A */ + + a = __builtin_shuffle( s1, ma ); /* A A A A A A A A */ + s1 |= o1; /* G 1 G 1 G 1 G 1 */ + s0 = __builtin_shuffle( s0, m0 ); /* B R B R B R B R */ + + s0 *= a; + s1 *= a; + s0 += 0x80; + s1 += 0x80; + s0 = ( s0 + ( s0 >> 8 ) ) >> 8; + s1 = ( s1 + ( s1 >> 8 ) ) >> 8; + + s = s0 | ( s1 << 8 ); + memcpy( base, &s, 16 ); + } +#endif /* use `vector_size' */ + + limit = row_info->rowbytes; + for ( ; i < limit; i += 4 ) { unsigned char* base = &data[i]; unsigned int alpha = base[3];