From 0aa7485b0d20987ce58c7343158cd73e81870920 Mon Sep 17 00:00:00 2001 From: Anurag Thakur Date: Sat, 19 Nov 2022 12:55:14 +0530 Subject: [PATCH] [dense] Migrate line drawing and accumulation to fixed-point * src/dense/ftdense.h: (FT26D6, FT20D12): New typedefs * src/dense/ftdense.c: dense_render_line, dense_render_glyph now use fixed-point numbers for calculation Disabled SIMD for now --- src/dense/ftdense.c | 179 ++++++++++++++++++++++++-------------------- src/dense/ftdense.h | 4 + 2 files changed, 102 insertions(+), 81 deletions(-) diff --git a/src/dense/ftdense.c b/src/dense/ftdense.c index a6b69f53e..c39d43e2a 100644 --- a/src/dense/ftdense.c +++ b/src/dense/ftdense.c @@ -81,57 +81,65 @@ dense_line_to( const FT_Vector* to, dense_worker* worker ) void dense_render_line( dense_worker* worker, FT_Pos tox, FT_Pos toy ) { - float from_x = worker->prev_x; - float from_y = worker->prev_y; - if ( from_y == toy ) + + FT26D6 fx = worker->prev_x>>2; + FT26D6 fy = worker->prev_y>>2; + + FT26D6 from_x = fx; + FT26D6 from_y = fy; + + + FT26D6 tx = tox>>2; + FT26D6 ty = toy>>2; + + if ( fy == ty ) return; + FT26D6 to_x = tx; + FT26D6 to_y = ty; - from_x /= 256.0; - from_y /= 256.0; - float to_x = tox / 256.0; - float to_y = toy / 256.0; - - - float dir; - if ( from_y < to_y ) - dir = 1; - else + int dir = 1; + if ( from_y >= to_y ) { dir = -1; - FT_SWAP(from_x, to_x ); - FT_SWAP(from_y, to_y ); + FT_SWAP(from_x, to_x); + FT_SWAP(from_y, to_y); } // Clip to the height. - if ( from_y >= worker->m_h || to_y <= 0 ) + if ( from_y >= worker->m_h<<6 || to_y <= 0 ) return; - float dxdy = ( to_x - from_x ) / (float)( to_y - from_y ); + FT26D6 deltax,deltay; + deltax = to_x - from_x; + deltay = to_y - from_y; + if ( from_y < 0 ) { - from_x -= from_y * dxdy; + from_x -= from_y * deltax/deltay; from_y = 0; } - if ( to_y > worker->m_h ) + + if ( to_y > worker->m_h<<6 ) { - to_x -= ( to_y - worker->m_h ) * dxdy; - to_y = (float)worker->m_h; + to_x -= (( to_y - worker->m_h<<6 ) * deltax/deltay); + to_y = worker->m_h<<6; } - float x = from_x; - int y0 = (int)from_y; - int y_limit = (int)ceil( to_y ); - float* m_a = worker->m_a; + int x = from_x; + int y0 = from_y>>6; + int y_limit = (to_y + 0x3f)>>6; + + FT20D12* m_a = worker->m_a; for ( int y = y0; y < y_limit; y++ ) { int linestart = y * worker->m_w; - float dy = fmin( y + 1.0f, to_y ) - fmax( (float)y, from_y ); - float xnext = x + dxdy * dy; - float d = dy * dir; + FT26D6 dy = FT_MIN( (y + 1)<<6, to_y ) - FT_MAX( y<<6, from_y ); + FT26D6 xnext = x + dy * deltax/deltay; + FT26D6 d = dy * dir; - float x0, x1; + FT26D6 x0, x1; if ( x < xnext ) { x0 = x; @@ -143,40 +151,48 @@ dense_render_line( dense_worker* worker, FT_Pos tox, FT_Pos toy ) x1 = x; } - /* - It's possible for x0 to be negative on the last scanline because of - floating-point inaccuracy That would cause an out-of-bounds array access at - index -1. - */ - float x0floor = x0 <= 0.0f ? 0.0f : (float)floor( x0 ); - int x0i = (int)x0floor; - float x1ceil = (float)ceil( x1 ); - int x1i = (int)x1ceil; + int x0i = x0>>6; + FT26D6 x0floor = x0i<<6; + + + int x1i = (x1+0x3f)>>6; + FT26D6 x1ceil = x1i <<6; + if ( x1i <= x0i + 1 ) { - float xmf = 0.5f * ( x + xnext ) - x0floor; - m_a[linestart + x0i] += d - d * xmf; + FT26D6 xmf = ( ( x + xnext )>>1) - x0floor; + m_a[linestart + x0i] += d * ((1<<6) - xmf); m_a[linestart + ( x0i + 1 )] += d * xmf; } else { - float s = 1.0f / ( x1 - x0 ); - float x0f = x0 - x0floor; - float a0 = 0.5f * s * ( 1.0f - x0f ) * ( 1.0f - x0f ); - float x1f = x1 - x1ceil + 1.0f; - float am = 0.5f * s * x1f * x1f; + + FT26D6 oneOverS = x1 - x0; + FT26D6 x0f = x0 - x0floor; + + + FT26D6 oneMinusX0f = (1<<6) - x0f; + FT26D6 a0 = ((oneMinusX0f * oneMinusX0f) >> 1) / oneOverS; + FT26D6 x1f = x1 - x1ceil + (1<<6); + FT26D6 am = ((x1f * x1f) >> 1) / oneOverS; + m_a[linestart + x0i] += d * a0; + if ( x1i == x0i + 2 ) - m_a[linestart + ( x0i + 1 )] += d * ( 1.0f - a0 - am ); + m_a[linestart + ( x0i + 1 )] += d * ( (1<<6) - a0 - am ); else { - float a1 = s * ( 1.5f - x0f ); + FT26D6 a1 = (((1<<6) + (1<<5) - x0f) << 6) / oneOverS; m_a[linestart + ( x0i + 1 )] += d * ( a1 - a0 ); - for ( int xi = x0i + 2; xi < x1i - 1; xi++ ) - m_a[linestart + xi] += d * s; - float a2 = a1 + ( x1i - x0i - 3 ) * s; - m_a[linestart + ( x1i - 1 )] += d * ( 1.0f - a2 - am ); + + FT26D6 dTimesS = (d << 12) / oneOverS; + + for ( FT26D6 xi = x0i + 2; xi < x1i - 1; xi++ ) + m_a[linestart + xi] += dTimesS; + + FT26D6 a2 = a1 + (( x1i - x0i - 3 )<<12)/oneOverS; + m_a[linestart + ( x1i - 1 )] += d * ( (1<<6) - a2 - am ); } m_a[linestart + x1i] += d * am; } @@ -364,48 +380,49 @@ dense_render_glyph( dense_worker* worker, const FT_Bitmap* target ) FT_Error error = FT_Outline_Decompose( &( worker->outline ), &dense_decompose_funcs, worker ); // Render into bitmap - const float* source = worker->m_a; + const FT20D12* source = worker->m_a; unsigned char* dest = target->buffer; unsigned char* dest_end = target->buffer + worker->m_w * worker->m_h; -#if FT_SSE4_1 +//#if FT_SSE4_1 - __m128 offset = _mm_setzero_ps(); - __m128i mask = _mm_set1_epi32(0x0c080400); - __m128 sign_mask = _mm_set1_ps(-0.f); - for (int i = 0; i < worker->m_h*worker->m_w; i += 4) { - __m128 x = _mm_load_ps(&source[i]); - x = _mm_add_ps(x, _mm_castsi128_ps(_mm_slli_si128(_mm_castps_si128(x), 4))); - x = _mm_add_ps(x, _mm_shuffle_ps(_mm_setzero_ps(), x, 0x40)); - x = _mm_add_ps(x, offset); - __m128 y = _mm_andnot_ps(sign_mask, x); // fabs(x) - y = _mm_min_ps(y, _mm_set1_ps(1.0f)); - y = _mm_mul_ps(y, _mm_set1_ps(255.0f)); - __m128i z = _mm_cvtps_epi32(y); - z = _mm_shuffle_epi8(z, mask); - _mm_store_ss((float *)&dest[i], (__m128)z); - offset = _mm_shuffle_ps(x, x, _MM_SHUFFLE(3, 3, 3, 3)); - } + // __m128 offset = _mm_setzero_ps(); + // __m128i mask = _mm_set1_epi32(0x0c080400); + // __m128 sign_mask = _mm_set1_ps(-0.f); + // for (int i = 0; i < worker->m_h*worker->m_w; i += 4) { + // __m128 x = _mm_load_ps(&source[i]); + // x = _mm_add_ps(x, _mm_castsi128_ps(_mm_slli_si128(_mm_castps_si128(x), 4))); + // x = _mm_add_ps(x, _mm_shuffle_ps(_mm_setzero_ps(), x, 0x40)); + // x = _mm_add_ps(x, offset); + // __m128 y = _mm_andnot_ps(sign_mask, x); // fabs(x) + // y = _mm_min_ps(y, _mm_set1_ps(1.0f)); + // y = _mm_mul_ps(y, _mm_set1_ps(255.0f)); + // __m128i z = _mm_cvtps_epi32(y); + // z = _mm_shuffle_epi8(z, mask); + // _mm_store_ss((float *)&dest[i], (__m128)z); + // offset = _mm_shuffle_ps(x, x, _MM_SHUFFLE(3, 3, 3, 3)); + // } -#else /* FT_SSE4_1 */ +//#else /* FT_SSE4_1 */ + + FT20D12 value = 0; - float value = 0.0f; while ( dest < dest_end ) { value += *source++; - if ( value > 0.0f ) - { - int n = (int)( fabs( value ) * 255.0f + 0.5f ); - if ( n > 255 ) - n = 255; + + if(value > 0){ + int n = value >>4; + + if(n>255)n=255; *dest = (unsigned char)n; - } - else + }else{ *dest = 0; + } dest++; } -#endif /* FT_SSE4_1 */ +//#endif /* FT_SSE4_1 */ free(worker->m_a); return error; @@ -444,10 +461,10 @@ dense_raster_render( FT_Raster raster, const FT_Raster_Params* params ) int size = worker->m_w * worker->m_h + 4; - worker->m_a = malloc( sizeof( float ) * size ); + worker->m_a = malloc( sizeof( FT20D12 ) * size ); worker->m_a_size = size; - memset( worker->m_a, 0, ( sizeof( float ) * size ) ); + memset( worker->m_a, 0, ( sizeof( FT20D12 ) * size ) ); /* exit if nothing to do */ if ( worker->m_w <= worker->m_origin_x || worker->m_h <= worker->m_origin_y ) { diff --git a/src/dense/ftdense.h b/src/dense/ftdense.h index 677b2ea33..0af2bb87f 100644 --- a/src/dense/ftdense.h +++ b/src/dense/ftdense.h @@ -19,6 +19,10 @@ extern "C" { #endif + + typedef signed long long FT26D6; /* 26.6 fixed-point representation */ + typedef signed long long FT20D12; /* 20.12 fixed-point representation */ + typedef struct { /** The array used to store signed area differences. */