diff --git a/ChangeLog b/ChangeLog index 5210b3582..f35316a5f 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,5 +1,15 @@ 2008-09-01 david turner + * include/freetype/ftoption.h, include/freetype/ftconfig.h, + builds/unix/ftconfig.in, include/freetype/freetype.h, + src/base/ftcalc.c: + Make FT_MulFix an inlined function. Also provide an assembler + implementation for ARM architectures. This is done to speed up + FreeType a little (on x86 3% when loading+hinting, 10% when + rendering, ARM savings are more important though). + Disable this by undefining FT_CONFIG_OPTION_INLINE_MULFIX in + ftoption.h. + * include/freetype/ftadvanc.h, src/base/ftadvanc.c, include/freetype/config/ftheader.h, include/freetype/freetype.h, src/base/Jamfile, src/base/rules.mk, src/cff/cffdrivr.c, diff --git a/builds/unix/ftconfig.in b/builds/unix/ftconfig.in index 1a9626465..6430abf76 100644 --- a/builds/unix/ftconfig.in +++ b/builds/unix/ftconfig.in @@ -197,6 +197,67 @@ FT_BEGIN_HEADER #endif /* FT_SIZEOF_LONG == 8 */ +#if !defined(FT_CONFIG_OPTION_NO_ASSEMBLER) +/* provide assembler fragments for performance-critical + * functions. 
these must be defined static __inline__ + * with GCC + */ +#if defined(__GNUC__) + +# if defined(__arm__) && !defined(__thumb__) +# define FT_MULFIX_ASSEMBLER FT_MulFix_arm + static __inline__ FT_Int32 + FT_MulFix_arm( FT_Int32 a, FT_Int32 b ) + { + register FT_Int32 t, t2; + __asm__ __volatile__ ( + "smull %1, %2, %4, %3\n\t" /* (lo=%1,hi=%2) = a*b */ + "mov %0, %2, asr #31\n\t" /* %0 = (hi >> 31) */ + "add %0, %0, #0x8000\n\t" /* %0 += 0x8000 */ + "adds %1, %1, %0\n\t" /* %1 += %0 */ + "adc %2, %2, #0\n\t" /* %2 += carry */ + "mov %0, %1, lsr #16\n\t" /* %0 = %1 >> 16 */ + "orr %0, %0, %2, lsl #16\n\t" /* %0 |= %2 << 16 */ + : "=r"(a), "=&r"(t2), "=&r"(t) + : "r"(a), "r"(b) + : "cc" ); /* `adds' modifies the condition flags */ + return a; + } +# endif /* __arm__ */ + +# if defined(i386) +# define FT_MULFIX_ASSEMBLER FT_MulFix_i386 + static __inline__ FT_Int32 + FT_MulFix_i386( FT_Int32 a, FT_Int32 b ) + { + register FT_Int32 result; + + __asm__ __volatile__ ( + "imul %%edx\n" + "movl %%edx, %%ecx\n" + "sarl $31, %%ecx\n" + "addl $0x8000, %%ecx\n" + "addl %%ecx, %%eax\n" + "adcl $0, %%edx\n" + "shrl $16, %%eax\n" + "shll $16, %%edx\n" + "addl %%edx, %%eax\n" + : "=a"(result), "+d"(b) + : "a"(a) + : "%ecx" + ); + return result; + } +# endif /* i386 */ +#endif /* __GNUC__ */ +#endif /* !NO_ASSEMBLER */ + +#ifdef FT_CONFIG_OPTION_INLINE_MULFIX +# ifdef FT_MULFIX_ASSEMBLER +# define FT_MULFIX_INLINED FT_MULFIX_ASSEMBLER +# endif +#endif + #define FT_BEGIN_STMNT do { #define FT_END_STMNT } while ( 0 ) diff --git a/include/freetype/config/ftconfig.h b/include/freetype/config/ftconfig.h index 09b2cf951..0e9daf3b3 100644 --- a/include/freetype/config/ftconfig.h +++ b/include/freetype/config/ftconfig.h @@ -225,6 +225,67 @@ FT_BEGIN_HEADER #endif +#if !defined(FT_CONFIG_OPTION_NO_ASSEMBLER) +/* provide assembler fragments for performance-critical + * functions. 
these must be defined static __inline__ + * with GCC + */ +#if defined(__GNUC__) + +# if defined(__arm__) && !defined(__thumb__) +# define FT_MULFIX_ASSEMBLER FT_MulFix_arm + static __inline__ FT_Int32 + FT_MulFix_arm( FT_Int32 a, FT_Int32 b ) + { + register FT_Int32 t, t2; + __asm__ __volatile__ ( + "smull %1, %2, %4, %3\n\t" /* (lo=%1,hi=%2) = a*b */ + "mov %0, %2, asr #31\n\t" /* %0 = (hi >> 31) */ + "add %0, %0, #0x8000\n\t" /* %0 += 0x8000 */ + "adds %1, %1, %0\n\t" /* %1 += %0 */ + "adc %2, %2, #0\n\t" /* %2 += carry */ + "mov %0, %1, lsr #16\n\t" /* %0 = %1 >> 16 */ + "orr %0, %0, %2, lsl #16\n\t" /* %0 |= %2 << 16 */ + : "=r"(a), "=&r"(t2), "=&r"(t) + : "r"(a), "r"(b) + : "cc" ); /* `adds' modifies the condition flags */ + return a; + } +# endif /* __arm__ */ + +# if defined(i386) +# define FT_MULFIX_ASSEMBLER FT_MulFix_i386 + static __inline__ FT_Int32 + FT_MulFix_i386( FT_Int32 a, FT_Int32 b ) + { + register FT_Int32 result; + + __asm__ __volatile__ ( + "imul %%edx\n" + "movl %%edx, %%ecx\n" + "sarl $31, %%ecx\n" + "addl $0x8000, %%ecx\n" + "addl %%ecx, %%eax\n" + "adcl $0, %%edx\n" + "shrl $16, %%eax\n" + "shll $16, %%edx\n" + "addl %%edx, %%eax\n" + : "=a"(result), "+d"(b) + : "a"(a) + : "%ecx" + ); + return result; + } +# endif /* i386 */ +#endif /* __GNUC__ */ +#endif /* !NO_ASSEMBLER */ + +#ifdef FT_CONFIG_OPTION_INLINE_MULFIX +# ifdef FT_MULFIX_ASSEMBLER +# define FT_MULFIX_INLINED FT_MULFIX_ASSEMBLER +# endif +#endif + /* determine whether we have a 64-bit int type for platforms without */ /* Autoconf */ diff --git a/include/freetype/config/ftoption.h b/include/freetype/config/ftoption.h index a2d61f906..a92e19bb2 100644 --- a/include/freetype/config/ftoption.h +++ b/include/freetype/config/ftoption.h @@ -115,6 +115,26 @@ FT_BEGIN_HEADER #undef FT_CONFIG_OPTION_FORCE_INT64 + /*************************************************************************/ + /* */ + /* When this macro is defined, do not try to use an assembler version */ + /* of performance-critical functions (e.g. FT_MulFix). 
You should only */ + /* do that to verify that the assembler function works properly, or even */ + /* to benchmark the various implementations. */ +/* #define FT_CONFIG_OPTION_NO_ASSEMBLER */ + + /*************************************************************************/ + /* */ + /* When this macro is defined, try to use an inlined assembler version */ + /* of the FT_MulFix function, which appears to be a hotspot when loading */ + /* and hinting glyphs. */ + /* */ + /* Note that if your compiler/CPU isn't supported, this will default to */ + /* the standard and portable implementation found in src/base/ftcalc.c. */ + /* */ +#define FT_CONFIG_OPTION_INLINE_MULFIX + + /*************************************************************************/ /* */ /* LZW-compressed file support. */ diff --git a/include/freetype/freetype.h b/include/freetype/freetype.h index b0193c71e..9289ca5ed 100644 --- a/include/freetype/freetype.h +++ b/include/freetype/freetype.h @@ -3468,10 +3468,13 @@ FT_BEGIN_HEADER /* _second_ argument of this function; this can make a great */ /* difference. */ /* */ +#ifdef FT_MULFIX_INLINED +# define FT_MulFix(a,b) FT_MULFIX_INLINED(a,b) +#else FT_EXPORT( FT_Long ) FT_MulFix( FT_Long a, FT_Long b ); - +#endif /*************************************************************************/ /* */ diff --git a/src/base/ftcalc.c b/src/base/ftcalc.c index 7d2381bbd..75e89c22f 100644 --- a/src/base/ftcalc.c +++ b/src/base/ftcalc.c @@ -38,6 +38,9 @@ #include FT_INTERNAL_DEBUG_H #include FT_INTERNAL_OBJECTS_H +#ifdef FT_MULFIX_INLINED +#undef FT_MulFix +#endif /* we need to define a 64-bits data type here */ @@ -193,6 +196,9 @@ FT_MulFix( FT_Long a, FT_Long b ) { +#ifdef FT_MULFIX_ASSEMBLER + return FT_MULFIX_ASSEMBLER(a,b); +#else FT_Int s = 1; FT_Long c; @@ -202,6 +208,7 @@ c = (FT_Long)( ( (FT_Int64)a * b + 0x8000L ) >> 16 ); return ( s > 0 ) ? 
c : -c ; +#endif } @@ -413,30 +420,8 @@ FT_MulFix( FT_Long a, FT_Long b ) { - /* use inline assembly to speed up things a bit */ - -#if defined( __GNUC__ ) && defined( i386 ) - - FT_Long result; - - - __asm__ __volatile__ ( - "imul %%edx\n" - "movl %%edx, %%ecx\n" - "sarl $31, %%ecx\n" - "addl $0x8000, %%ecx\n" - "addl %%ecx, %%eax\n" - "adcl $0, %%edx\n" - "shrl $16, %%eax\n" - "shll $16, %%edx\n" - "addl %%edx, %%eax\n" - "mov %%eax, %0\n" - : "=a"(result), "+d"(b) - : "a"(a) - : "%ecx" - ); - return result; - +#ifdef FT_MULFIX_ASSEMBLER + return FT_MULFIX_ASSEMBLER(a,b); #elif 0 /*