[smooth] Faster bitmap sweeping.

Selecting the fill rule or checking the direct mode each time we call
`gray_hline' is sub-optimal.  This effectively splits the direct mode
into a separate code path while inlining `gray_hline' and saving 5-7%
of rendering time.

* src/smooth/ftgrays.c (gray_hline): Eliminated in favor of...
(FT_FILL_RULE, FT_GRAY_SET): ... these new macros...
(gray_sweep): ... inlined here.
(gray_sweep_direct): New function that handles the direct span buffer.
(gray_TWorker): Remove the span buffer.
(gray_raster_render, gray_convert_glyph): Updated.
This commit is contained in:
Alexei Podtelezhnikov 2021-05-12 00:04:59 -04:00
parent b070264521
commit 8f43d324d1
2 changed files with 134 additions and 81 deletions

View File

@ -1,3 +1,19 @@
2021-05-07 Alexei Podtelezhnikov <apodtele@gmail.com>
[smooth] Faster bitmap sweeping.
Selecting the fill rule or checking the direct mode each time we call
`gray_hline' is sub-optimal. This effectively splits the direct mode
into a separate code path while inlining `gray_hline' and saving 5-7%
of rendering time.
* src/smooth/ftgrays.c (gray_hline): Eliminated in favor of...
(FT_FILL_RULE, FT_GRAY_SET): ... these new macros...
(gray_sweep): ... inlined here.
(gray_sweep_direct): New function that handles the direct span buffer.
(gray_TWorker): Remove the span buffer.
(gray_raster_render, gray_convert_glyph): Updated.
2021-05-10 Alexei Podtelezhnikov <apodtele@gmail.com>
* src/smooth/ftgrays.c (gray_hline): Simplify even-odd computations.

View File

@ -392,6 +392,42 @@ typedef ptrdiff_t FT_PtrDist;
( sizeof( long ) * FT_CHAR_BIT - PIXEL_BITS ) )
/* Scale area and apply fill rule to calculate the coverage byte. */
/* The top fill bit is used for the non-zero rule. The eighth */
/* fill bit is used for the even-odd rule. The higher coverage */
/* bytes are either clamped for the non-zero-rule or discarded */
/* later for the even-odd rule. */
#define FT_FILL_RULE( coverage, area, fill ) \
FT_BEGIN_STMNT \
coverage = (int)( area >> ( PIXEL_BITS * 2 + 1 - 8 ) ); \
if ( coverage & fill ) \
coverage = ~coverage; \
if ( coverage > 255 && fill & INT_MIN ) \
coverage = 255; \
FT_END_STMNT
/* It is faster to write small spans byte-by-byte than calling */
/* `memset'. This is mainly due to the cost of the function call. */
#define FT_GRAY_SET( d, s, count ) \
FT_BEGIN_STMNT \
unsigned char* q = d; \
unsigned char c = (unsigned char)s; \
switch ( count ) \
{ \
case 7: *q++ = c; /* fall through */ \
case 6: *q++ = c; /* fall through */ \
case 5: *q++ = c; /* fall through */ \
case 4: *q++ = c; /* fall through */ \
case 3: *q++ = c; /* fall through */ \
case 2: *q++ = c; /* fall through */ \
case 1: *q = c; /* fall through */ \
case 0: break; \
default: FT_MEM_SET( d, s, count ); \
} \
FT_END_STMNT
/**************************************************************************
*
* TYPE DEFINITIONS
@ -463,8 +499,6 @@ typedef ptrdiff_t FT_PtrDist;
FT_Raster_Span_Func render_span;
void* render_span_data;
FT_Span spans[FT_MAX_GRAY_SPANS];
int num_spans;
} gray_TWorker, *gray_PWorker;
@ -1171,94 +1205,62 @@ typedef ptrdiff_t FT_PtrDist;
static void
gray_hline( RAS_ARG_ TCoord x,
TCoord y,
TArea coverage,
TCoord acount )
gray_sweep( RAS_ARG )
{
/* scale the coverage from 0..(ONE_PIXEL*ONE_PIXEL*2) to 0..256 */
coverage >>= PIXEL_BITS * 2 + 1 - 8;
int fill = ras.outline.flags & FT_OUTLINE_EVEN_ODD_FILL ? 0x100 : INT_MIN;
int coverage;
int y;
/* compute the line's coverage depending on the outline fill rule */
if ( ras.outline.flags & FT_OUTLINE_EVEN_ODD_FILL )
for ( y = ras.min_ey; y < ras.max_ey; y++ )
{
if ( coverage & 256 ) /* odd bit */
coverage = ~coverage;
PCell cell = ras.ycells[y - ras.min_ey];
TCoord x = ras.min_ex;
TArea cover = 0;
TArea area;
/* higher bits discarded below */
}
else /* default non-zero winding rule */
{
if ( coverage < 0 )
coverage = ~coverage; /* the same as -coverage - 1 */
if ( coverage >= 256 )
coverage = 255;
}
if ( ras.num_spans >= 0 ) /* for FT_RASTER_FLAG_DIRECT only */
{
FT_Span* span = ras.spans + ras.num_spans++;
unsigned char* line = ras.target.origin - ras.target.pitch * y;
span->x = (short)x;
span->len = (unsigned short)acount;
span->coverage = (unsigned char)coverage;
if ( ras.num_spans == FT_MAX_GRAY_SPANS )
for ( ; cell != NULL; cell = cell->next )
{
/* flush the span buffer and reset the count */
ras.render_span( y, ras.num_spans, ras.spans, ras.render_span_data );
ras.num_spans = 0;
if ( cover != 0 && cell->x > x )
{
FT_FILL_RULE( coverage, cover, fill );
FT_GRAY_SET( line + x, coverage, cell->x - x );
}
cover += (TArea)cell->cover * ( ONE_PIXEL * 2 );
area = cover - cell->area;
if ( area != 0 && cell->x >= ras.min_ex )
{
FT_FILL_RULE( coverage, area, fill );
line[cell->x] = (unsigned char)coverage;
}
x = cell->x + 1;
}
}
else
{
unsigned char* q = ras.target.origin - ras.target.pitch * y + x;
unsigned char c = (unsigned char)coverage;
/* For small-spans it is faster to do it by ourselves than
* calling `memset'. This is mainly due to the cost of the
* function call.
*/
switch ( acount )
if ( cover != 0 ) /* only if cropped */
{
case 7:
*q++ = c;
/* fall through */
case 6:
*q++ = c;
/* fall through */
case 5:
*q++ = c;
/* fall through */
case 4:
*q++ = c;
/* fall through */
case 3:
*q++ = c;
/* fall through */
case 2:
*q++ = c;
/* fall through */
case 1:
*q = c;
/* fall through */
case 0:
break;
default:
FT_MEM_SET( q, c, acount );
FT_FILL_RULE( coverage, cover, fill );
FT_GRAY_SET( line + x, coverage, ras.max_ex - x );
}
}
}
static void
gray_sweep( RAS_ARG )
gray_sweep_direct( RAS_ARG )
{
int fill = ras.outline.flags & FT_OUTLINE_EVEN_ODD_FILL ? 0x100 : INT_MIN;
int coverage;
int y;
FT_Span span[FT_MAX_GRAY_SPANS];
int n = 0;
for ( y = ras.min_ey; y < ras.max_ey; y++ )
{
@ -1271,25 +1273,59 @@ typedef ptrdiff_t FT_PtrDist;
for ( ; cell != NULL; cell = cell->next )
{
if ( cover != 0 && cell->x > x )
gray_hline( RAS_VAR_ x, y, cover, cell->x - x );
{
FT_FILL_RULE( coverage, cover, fill );
span[n].coverage = (unsigned char)coverage;
span[n].x = (short)x;
span[n].len = (unsigned short)( cell->x - x );
if ( ++n == FT_MAX_GRAY_SPANS )
{
/* flush the span buffer and reset the count */
ras.render_span( y, n, span, ras.render_span_data );
n = 0;
}
}
cover += (TArea)cell->cover * ( ONE_PIXEL * 2 );
area = cover - cell->area;
if ( area != 0 && cell->x >= ras.min_ex )
gray_hline( RAS_VAR_ cell->x, y, area, 1 );
{
FT_FILL_RULE( coverage, area, fill );
span[n].coverage = (unsigned char)coverage;
span[n].x = (short)cell->x;
span[n].len = 1;
if ( ++n == FT_MAX_GRAY_SPANS )
{
/* flush the span buffer and reset the count */
ras.render_span( y, n, span, ras.render_span_data );
n = 0;
}
}
x = cell->x + 1;
}
if ( cover != 0 )
gray_hline( RAS_VAR_ x, y, cover, ras.max_ex - x );
if ( cover != 0 ) /* only if cropped */
{
FT_FILL_RULE( coverage, cover, fill );
if ( ras.num_spans > 0 ) /* for FT_RASTER_FLAG_DIRECT only */
span[n].coverage = (unsigned char)coverage;
span[n].x = (short)x;
span[n].len = (unsigned short)( ras.max_ex - x );
++n;
}
if ( n )
{
/* flush the span buffer and reset the count */
ras.render_span( y, ras.num_spans, ras.spans, ras.render_span_data );
ras.num_spans = 0;
ras.render_span( y, n, span, ras.render_span_data );
n = 0;
}
}
}
@ -1688,7 +1724,10 @@ typedef ptrdiff_t FT_PtrDist;
if ( !error )
{
gray_sweep( RAS_VAR );
if ( ras.render_span ) /* for FT_RASTER_FLAG_DIRECT only */
gray_sweep_direct( RAS_VAR );
else
gray_sweep( RAS_VAR );
band--;
continue;
}
@ -1757,7 +1796,6 @@ typedef ptrdiff_t FT_PtrDist;
ras.render_span = (FT_Raster_Span_Func)params->gray_spans;
ras.render_span_data = params->user;
ras.num_spans = 0;
ras.min_ex = params->clip_box.xMin;
ras.min_ey = params->clip_box.yMin;
@ -1787,7 +1825,6 @@ typedef ptrdiff_t FT_PtrDist;
ras.render_span = (FT_Raster_Span_Func)NULL;
ras.render_span_data = NULL;
ras.num_spans = -1; /* invalid */
ras.min_ex = 0;
ras.min_ey = 0;