Aegisub/vsfilter/subtitles/Rasterizer.cpp

1088 lines
26 KiB
C++

/*
* Copyright (C) 2003-2006 Gabest
* http://www.gabest.org
*
* This Program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2, or (at your option)
* any later version.
*
* This Program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with GNU Make; see the file COPYING. If not, write to
* the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
* http://www.gnu.org/copyleft/gpl.html
*
*/
#include "stdafx.h"
#include <string.h>
#include <math.h>
#include <vector>
#include <algorithm>
#include "Rasterizer.h"
// Constructs an empty rasterizer: no path data, no overlay bitmap and all
// offsets at zero.
//
// mWidth, mHeight and mWideBorder are zeroed here as well: Rasterize() reads
// all three (the empty-size test and the border rounding), and in the code
// visible in this file they are otherwise only assigned by ScanConvert(),
// CreateWidenedRegion() and Rasterize() itself. Leaving them indeterminate
// made a Rasterize() call on a fresh object read uninitialised memory.
Rasterizer::Rasterizer() : mpPathTypes(NULL), mpPathPoints(NULL), mPathPoints(0), mpOverlayBuffer(NULL)
{
	mOverlayWidth = mOverlayHeight = 0;
	mPathOffsetX = mPathOffsetY = 0;
	mOffsetX = mOffsetY = 0;
	mWidth = mHeight = 0;
	mWideBorder = 0;
}
// Destructor: gives back the overlay bitmap and whatever path arrays are
// still held. The two clean-ups touch disjoint members.
Rasterizer::~Rasterizer()
{
	_TrashOverlay();
	_TrashPath();
}
// Releases the stored path (point types and coordinates) and resets the
// path to "empty".
void Rasterizer::_TrashPath()
{
	// EndPath() and PartialEndPath() obtain these arrays with malloc() and
	// realloc(), so they have to be returned with free(); handing them to
	// delete[] is undefined behaviour. free(NULL) is a no-op, so no guard
	// is needed for an already-empty path.
	free(mpPathTypes);
	free(mpPathPoints);
	mpPathTypes = NULL;
	mpPathPoints = NULL;
	mPathPoints = 0;
}
void Rasterizer::_TrashOverlay()
{
delete [] mpOverlayBuffer;
mpOverlayBuffer = NULL;
}
void Rasterizer::_ReallocEdgeBuffer(int edges)
{
mEdgeHeapSize = edges;
mpEdgeBuffer = (Edge*)realloc(mpEdgeBuffer, sizeof(Edge)*edges);
}
// Flattens one cubic curve segment into straight line pieces and passes each
// piece to _EvaluateLine().
//
// ptbase   - index into mpPathPoints of the first of the four consecutive
//            control points that define the segment.
// fBSpline - true: interpret the four points with the uniform cubic B-spline
//            basis; false: interpret them as a cubic Bezier.
void Rasterizer::_EvaluateBezier(int ptbase, bool fBSpline)
{
const POINT* pt0 = mpPathPoints + ptbase;
const POINT* pt1 = mpPathPoints + ptbase + 1;
const POINT* pt2 = mpPathPoints + ptbase + 2;
const POINT* pt3 = mpPathPoints + ptbase + 3;
double x0 = pt0->x;
double x1 = pt1->x;
double x2 = pt2->x;
double x3 = pt3->x;
double y0 = pt0->y;
double y1 = pt1->y;
double y2 = pt2->y;
double y3 = pt3->y;
// Power-basis coefficients: p(t) = c0 + c1*t + c2*t^2 + c3*t^3, per axis.
double cx3, cx2, cx1, cx0, cy3, cy2, cy1, cy0;
if(fBSpline)
{
// 1 [-1 +3 -3 +1]
// - * [+3 -6 +3 0]
// 6 [-3 0 +3 0]
// [+1 +4 +1 0]
double _1div6 = 1.0/6.0;
cx3 = _1div6*(- x0+3*x1-3*x2+x3);
cx2 = _1div6*( 3*x0-6*x1+3*x2);
cx1 = _1div6*(-3*x0 +3*x2);
cx0 = _1div6*( x0+4*x1+1*x2);
cy3 = _1div6*(- y0+3*y1-3*y2+y3);
cy2 = _1div6*( 3*y0-6*y1+3*y2);
cy1 = _1div6*(-3*y0 +3*y2);
cy0 = _1div6*( y0+4*y1+1*y2);
}
else // bezier
{
// [-1 +3 -3 +1]
// [+3 -6 +3 0]
// [-3 +3 0 0]
// [+1 0 0 0]
cx3 = - x0+3*x1-3*x2+x3;
cx2 = 3*x0-6*x1+3*x2;
cx1 = -3*x0+3*x1;
cx0 = x0;
cy3 = - y0+3*y1-3*y2+y3;
cy2 = 3*y0-6*y1+3*y2;
cy1 = -3*y0+3*y1;
cy0 = y0;
}
//
// This equation is from Graphics Gems I.
//
// The idea is that since we're approximating a cubic curve with lines,
// any error we incur is due to the curvature of the line, which we can
// estimate by calculating the maximum acceleration of the curve. For
// a cubic, the acceleration (second derivative) is a line, meaning that
// the absolute maximum acceleration must occur at either the beginning
// (|c2|) or the end (|c2+c3|). Our bounds here are a little more
// conservative than that, but that's okay.
//
// If the acceleration of the parametric formula is zero (c2 = c3 = 0),
// that component of the curve is linear and does not incur any error.
// If a=0 for both X and Y, the curve is a line segment and we can
// use a step size of 1.
double maxaccel1 = fabs(2*cy2) + fabs(6*cy3);
double maxaccel2 = fabs(2*cx2) + fabs(6*cx3);
double maxaccel = maxaccel1 > maxaccel2 ? maxaccel1 : maxaccel2;
// h is the parameter step; it shrinks as the acceleration bound grows.
double h = 1.0;
if(maxaccel > 8.0) h = sqrt(8.0 / maxaccel);
// If no point of the current figure has been recorded yet, the curve's
// start (t = 0, i.e. c0 truncated to integer) becomes both the figure's
// first point and the current point.
if(!fFirstSet) {firstp.x = (LONG)cx0; firstp.y = (LONG)cy0; lastp = firstp; fFirstSet = true;}
// Walk t over [0, 1) in steps of h. Each call draws from lastp to the
// evaluated point (truncated to int); _EvaluateLine() then moves lastp
// to that point.
for(double t = 0; t < 1.0; t += h)
{
double x = cx0 + t*(cx1 + t*(cx2 + t*cx3));
double y = cy0 + t*(cy1 + t*(cy2 + t*cy3));
_EvaluateLine(lastp.x, lastp.y, (int)x, (int)y);
}
// Close the segment exactly at t = 1, where the polynomial is simply the
// sum of its coefficients.
double x = cx0 + cx1 + cx2 + cx3;
double y = cy0 + cy1 + cy2 + cy3;
_EvaluateLine(lastp.x, lastp.y, (int)x, (int)y);
}
void Rasterizer::_EvaluateLine(int pt1idx, int pt2idx)
{
const POINT* pt1 = mpPathPoints + pt1idx;
const POINT* pt2 = mpPathPoints + pt2idx;
_EvaluateLine(pt1->x, pt1->y, pt2->x, pt2->y);
}
// Adds the edge crossings of the line (x0,y0)-(x1,y1) to the per-scanline
// edge lists (mpScanBuffer / mpEdgeBuffer).
//
// For every scanline the line crosses, one Edge record is pushed onto the
// front of that scanline's singly linked list. posandflag stores the x
// position times two, with the low bit set for lines running downwards
// (y1 > y0) and clear for lines running upwards. Horizontal lines add
// nothing. lastp is updated to (x1,y1).
void Rasterizer::_EvaluateLine(int x0, int y0, int x1, int y1)
{
// If the caller's start point is not where the previous line ended, first
// draw the connecting line. The recursive call starts at lastp, so it
// cannot recurse again.
if(lastp.x != x0 || lastp.y != y0)
{
_EvaluateLine(lastp.x, lastp.y, x0, y0);
}
// Remember the first point of the figure so it can be closed later.
if(!fFirstSet) {firstp.x = x0; firstp.y = y0; fFirstSet = true;}
lastp.x = x1; lastp.y = y1;
if(y1 > y0) // down
{
// x as 16.16 fixed point of x0/8 (x0<<13 == (x0/8)<<16).
__int64 xacc = (__int64)x0 << 13;
// prestep y0 down
int dy = y1 - y0;
// y is the first sample position (a multiple of 8, plus 4) used for this
// line; iy is its scanline index.
int y = ((y0 + 3)&~7) + 4;
int iy = y >> 3;
// y1 becomes the index of the last scanline to receive an edge.
y1 = (y1 - 5) >> 3;
if(iy <= y1)
{
// Change of xacc per scanline (8 input y units).
__int64 invslope = (__int64(x1 - x0) << 16) / dy;
// Grow the edge heap until all edges of this line fit.
while(mEdgeNext + y1 + 1 - iy > mEdgeHeapSize)
_ReallocEdgeBuffer(mEdgeHeapSize*2);
// Advance x from y0 to the first sample position.
xacc += (invslope * (y - y0)) >> 3;
while(iy <= y1)
{
// Round the fixed-point x to the nearest integer.
int ix = (int)((xacc + 32768) >> 16);
mpEdgeBuffer[mEdgeNext].next = mpScanBuffer[iy];
mpEdgeBuffer[mEdgeNext].posandflag = ix*2 + 1;
mpScanBuffer[iy] = mEdgeNext++;
++iy;
xacc += invslope;
}
}
}
else if(y1 < y0) // up
{
// Same procedure as above with the end points swapped, so the walk still
// runs from the smaller y to the larger one; the flag bit is left clear.
__int64 xacc = (__int64)x1 << 13;
// prestep y1 down
int dy = y0 - y1;
int y = ((y1 + 3)&~7) + 4;
int iy = y >> 3;
y0 = (y0 - 5) >> 3;
if(iy <= y0)
{
__int64 invslope = (__int64(x0 - x1) << 16) / dy;
while(mEdgeNext + y0 + 1 - iy > mEdgeHeapSize)
_ReallocEdgeBuffer(mEdgeHeapSize*2);
xacc += (invslope * (y - y1)) >> 3;
while(iy <= y0)
{
int ix = (int)((xacc + 32768) >> 16);
mpEdgeBuffer[mEdgeNext].next = mpScanBuffer[iy];
mpEdgeBuffer[mEdgeNext].posandflag = ix*2;
mpScanBuffer[iy] = mEdgeNext++;
++iy;
xacc += invslope;
}
}
}
}
// Discards any stored path and opens a new GDI path bracket on hdc.
// Returns true when the GDI ::BeginPath() call reports success.
bool Rasterizer::BeginPath(HDC hdc)
{
	_TrashPath();
	const bool opened = (::BeginPath(hdc) != 0);
	return opened;
}
// Closes the current figure, ends the GDI path bracket on hdc and copies the
// resulting path into mpPathTypes / mpPathPoints / mPathPoints.
//
// Returns true on success (including an empty path). On any failure the GDI
// path is aborted, the previously stored path is left untouched and false is
// returned.
bool Rasterizer::EndPath(HDC hdc)
{
	::CloseFigure(hdc);
	if(::EndPath(hdc))
	{
		// First call only queries the number of points. A negative result
		// signals failure and must never be used as an allocation size.
		const int nPoints = GetPath(hdc, NULL, NULL, 0);
		if(nPoints == 0)
		{
			mPathPoints = 0;
			return true;
		}
		if(nPoints > 0)
		{
			BYTE* pTypes = (BYTE*)malloc(sizeof(BYTE) * nPoints);
			POINT* pPoints = (POINT*)malloc(sizeof(POINT) * nPoints);
			if(pTypes && pPoints && nPoints == GetPath(hdc, pPoints, pTypes, nPoints))
			{
				// Only commit once everything succeeded. Any previous arrays
				// come from malloc()/realloc(), so free() is the right release.
				free(mpPathTypes);
				free(mpPathPoints);
				mpPathTypes = pTypes;
				mpPathPoints = pPoints;
				mPathPoints = nPoints;
				return true;
			}
			// Allocation or retrieval failed: drop the partial buffers so the
			// object never holds a point count without valid data.
			free(pTypes);
			free(pPoints);
		}
	}
	::AbortPath(hdc);
	return false;
}
// Opens a GDI path bracket on hdc, optionally discarding the path collected
// so far (bClearPath). Returns true when ::BeginPath() reports success.
bool Rasterizer::PartialBeginPath(HDC hdc, bool bClearPath)
{
	if(bClearPath)
	{
		_TrashPath();
	}
	return ::BeginPath(hdc) != 0;
}
// Closes the current figure, ends the GDI path bracket on hdc and appends the
// resulting points to the stored path, translating every new point by
// (dx, dy).
//
// Returns true on success (including an empty GDI path). On failure the GDI
// path is aborted and false is returned; mPathPoints is not changed.
bool Rasterizer::PartialEndPath(HDC hdc, long dx, long dy)
{
	::CloseFigure(hdc);
	if(::EndPath(hdc))
	{
		// First call only queries the number of points. A negative result
		// signals failure and must not reach realloc() as part of a size.
		const int nPoints = GetPath(hdc, NULL, NULL, 0);
		if(nPoints == 0)
			return true;
		if(nPoints > 0)
		{
			BYTE* pNewTypes = (BYTE*)realloc(mpPathTypes, (mPathPoints + nPoints) * sizeof(BYTE));
			POINT* pNewPoints = (POINT*)realloc(mpPathPoints, (mPathPoints + nPoints) * sizeof(POINT));
			// realloc() may have moved either array even if the other one
			// failed, so adopt every successful result immediately.
			if(pNewTypes)
				mpPathTypes = pNewTypes;
			if(pNewPoints)
				mpPathPoints = pNewPoints;
			if(pNewTypes && pNewPoints)
			{
				// Fetch the new points straight into the freshly grown tail;
				// entries past mPathPoints are ignored until the count is bumped.
				BYTE* pTypes = mpPathTypes + mPathPoints;
				POINT* pPoints = mpPathPoints + mPathPoints;
				if(nPoints == GetPath(hdc, pPoints, pTypes, nPoints))
				{
					for(int i = 0; i < nPoints; ++i)
					{
						pPoints[i].x += dx;
						pPoints[i].y += dy;
					}
					mPathPoints += nPoints;
					return true;
				}
			}
			// Allocation or retrieval failure: same debugger trap as before.
			DebugBreak();
		}
	}
	::AbortPath(hdc);
	return false;
}
// Converts the stored path (mpPathPoints / mpPathTypes) into a list of
// horizontal spans in mOutline, then frees the path.
//
// Side effects: clears mOutline and mWideOutline, sets mWidth / mHeight and
// mPathOffsetX / mPathOffsetY, and uses mpEdgeBuffer / mpScanBuffer as
// scratch storage that is released before returning.
//
// Returns false (written as 0) when there are no path points at all, and
// true in every other case.
bool Rasterizer::ScanConvert()
{
int lastmoveto = -1;
int i;
// Drop any outlines we may have.
mOutline.clear();
mWideOutline.clear();
// Determine bounding box
if(!mPathPoints)
{
mPathOffsetX = mPathOffsetY = 0;
mWidth = mHeight = 0;
return 0;
}
int minx = INT_MAX;
int miny = INT_MAX;
int maxx = INT_MIN;
int maxy = INT_MIN;
for(i=0; i<mPathPoints; ++i)
{
int ix = mpPathPoints[i].x;
int iy = mpPathPoints[i].y;
if(ix < minx) minx = ix;
if(ix > maxx) maxx = ix;
if(iy < miny) miny = iy;
if(iy > maxy) maxy = iy;
}
// Reduce the bounds by a factor of 8: the minimum is rounded down and then
// aligned down to a multiple of 8, the maximum is rounded up.
minx = (minx >> 3) & ~7;
miny = (miny >> 3) & ~7;
maxx = (maxx + 7) >> 3;
maxy = (maxy + 7) >> 3;
// Translate the path so the aligned minimum becomes the origin.
for(i=0; i<mPathPoints; ++i)
{
mpPathPoints[i].x -= minx*8;
mpPathPoints[i].y -= miny*8;
}
// Degenerate bounding box: report an empty shape.
if(minx > maxx || miny > maxy)
{
mWidth = mHeight = 0;
mPathOffsetX = mPathOffsetY = 0;
_TrashPath();
return true;
}
mWidth = maxx + 1 - minx;
mHeight = maxy + 1 - miny;
mPathOffsetX = minx;
mPathOffsetY = miny;
// Initialize edge buffer. We use edge 0 as a sentinel.
mEdgeNext = 1;
mEdgeHeapSize = 2048;
mpEdgeBuffer = (Edge*)malloc(sizeof(Edge)*mEdgeHeapSize);
// Initialize scanline list.
// One list head per scanline; 0 means "empty" (it refers to the sentinel).
mpScanBuffer = new unsigned int[mHeight];
memset(mpScanBuffer, 0, mHeight*sizeof(unsigned int));
// Scan convert the outline. Yuck, Bezier curves....
// Unfortunately, Windows 95/98 GDI has a bad habit of giving us text
// paths with all but the first figure left open, so we can't rely
// on the PT_CLOSEFIGURE flag being used appropriately.
fFirstSet = false;
firstp.x = firstp.y = 0;
lastp.x = lastp.y = 0;
for(i=0; i<mPathPoints; ++i)
{
BYTE t = mpPathTypes[i] & ~PT_CLOSEFIGURE;
switch(t)
{
case PT_MOVETO:
// A new figure starts: close the previous one explicitly if it was
// left open, then restart first/last point tracking.
if(lastmoveto >= 0 && firstp != lastp)
_EvaluateLine(lastp.x, lastp.y, firstp.x, firstp.y);
lastmoveto = i;
fFirstSet = false;
lastp = mpPathPoints[i];
break;
case PT_MOVETONC:
break;
case PT_LINETO:
// Line from the previous point to this one (needs 2 points from i-1).
if(mPathPoints - (i-1) >= 2) _EvaluateLine(i-1, i);
break;
case PT_BEZIERTO:
// Cubic Bezier using the previous point plus three points starting
// here; the two extra control points are skipped afterwards.
if(mPathPoints - (i-1) >= 4) _EvaluateBezier(i-1, false);
i += 2;
break;
case PT_BSPLINETO:
// Same layout as PT_BEZIERTO, evaluated with the B-spline basis.
if(mPathPoints - (i-1) >= 4) _EvaluateBezier(i-1, true);
i += 2;
break;
case PT_BSPLINEPATCHTO:
// B-spline patch using the three preceding points plus this one.
if(mPathPoints - (i-3) >= 4) _EvaluateBezier(i-3, true);
break;
}
}
// Close the final figure if it was left open.
if(lastmoveto >= 0 && firstp != lastp)
_EvaluateLine(lastp.x, lastp.y, firstp.x, firstp.y);
// Free the path since we don't need it anymore.
_TrashPath();
// Convert the edges to spans. We couldn't do this before because some of
// the regions may have winding numbers >+1 and it would have been a pain
// to try to adjust the spans on the fly. We use one heap to detangle
// a scanline's worth of edges from the singly-linked lists, and another
// to collect the actual scans.
std::vector<int> heap;
mOutline.reserve(mEdgeNext / 2);
__int64 y = 0;
for(y=0; y<mHeight; ++y)
{
int count = 0;
// Detangle scanline into edge heap.
for(unsigned ptr = (unsigned)(mpScanBuffer[y]&0xffffffff); ptr; ptr = mpEdgeBuffer[ptr].next)
{
heap.push_back(mpEdgeBuffer[ptr].posandflag);
}
// Sort edge heap. Note that we conveniently made the opening edges
// one more than closing edges at the same spot, so we won't have any
// problems with abutting spans.
std::sort(heap.begin(), heap.end()/*begin() + heap.size()*/);
// Process edges and add spans. Since we only check for a non-zero
// winding number, it doesn't matter which way the outlines go!
std::vector<int>::iterator itX1 = heap.begin();
std::vector<int>::iterator itX2 = heap.end(); // begin() + heap.size();
int x1, x2;
for(; itX1 != itX2; ++itX1)
{
int x = *itX1;
// count is the running winding number; a span opens when it leaves
// zero and closes when it returns to zero.
if(!count)
x1 = (x>>1);
if(x&1)
++count;
else
--count;
if(!count)
{
x2 = (x>>1);
// Span keys pack y into the upper 32 bits and x into the lower 32 bits,
// each biased by 0x40000000; Rasterize() undoes this bias when decoding.
if(x2>x1)
mOutline.push_back(std::pair<__int64,__int64>((y<<32)+x1+0x4000000040000000i64, (y<<32)+x2+0x4000000040000000i64)); // G: damn Avery, this is evil! :)
}
}
heap.clear();
}
// Dump the edge and scan buffers, since we no longer need them.
free(mpEdgeBuffer);
delete [] mpScanBuffer;
// All done!
return true;
}
using namespace std;
// Merges a shifted, horizontally widened copy of src into dst.
//
// Each span is a pair of 64-bit keys (start, end). ScanConvert() builds a key
// as (y<<32)+x plus a fixed bias, so adding (dy<<32) moves a span by dy
// scanlines, and adding -dx to the start key and +dx to the end key extends
// it by dx on both sides.
//
// The previous contents of dst (list A) and the transformed spans of src
// (list B) are walked together in key order; whenever spans touch or overlap
// they are fused into one, so the result is the union of both lists.
// The merge assumes each list is sorted by key and free of internal
// overlaps, as the in-line comments below state. NOTE(review): confirm this
// holds for every caller.
void Rasterizer::_OverlapRegion(tSpanBuffer& dst, tSpanBuffer& src, int dx, int dy)
{
// Move the current contents of dst into temp (list A) and rebuild dst from
// scratch; the reserve() call ends up applying to the new, empty dst.
tSpanBuffer temp;
temp.reserve(dst.size() + src.size());
dst.swap(temp);
tSpanBuffer::iterator itA = temp.begin();
tSpanBuffer::iterator itAE = temp.end();
tSpanBuffer::iterator itB = src.begin();
tSpanBuffer::iterator itBE = src.end();
// Don't worry -- even if dy<0 this will still work! // G: hehe, the evil twin :)
// This is where the X-axis is mirrored
// offset1 is applied to span starts (shift by dy, extend left by dx);
// offset2 is applied to span ends (shift by dy, extend right by dx).
unsigned __int64 offset1 = (((__int64)dy)<<32) - dx;
unsigned __int64 offset2 = (((__int64)dy)<<32) + dx;
while(itA != itAE && itB != itBE)
{
if((*itB).first + offset1 < (*itA).first)
{
// B span is earlier. Use it.
unsigned __int64 x1 = (*itB).first + offset1;
unsigned __int64 x2 = (*itB).second + offset2;
++itB;
// B spans don't overlap, so begin merge loop with A first.
for(;;)
{
// If we run out of A spans or the A span doesn't overlap,
// then the next B span can't either (because B spans don't
// overlap) and we exit.
if(itA == itAE || (*itA).first > x2)
break;
// Absorb every A span that starts inside the current span.
do {x2 = _MAX(x2, (*itA++).second);}
while(itA != itAE && (*itA).first <= x2);
// If we run out of B spans or the B span doesn't overlap,
// then the next A span can't either (because A spans don't
// overlap) and we exit.
if(itB == itBE || (*itB).first + offset1 > x2)
break;
// Absorb every transformed B span that starts inside the current span.
do {x2 = _MAX(x2, (*itB++).second + offset2);}
while(itB != itBE && (*itB).first + offset1 <= x2);
}
// Flush span.
dst.push_back(tSpan(x1, x2));
}
else
{
// A span is earlier. Use it.
unsigned __int64 x1 = (*itA).first;
unsigned __int64 x2 = (*itA).second;
++itA;
// A spans don't overlap, so begin merge loop with B first.
for(;;)
{
// If we run out of B spans or the B span doesn't overlap,
// then the next A span can't either (because A spans don't
// overlap) and we exit.
if(itB == itBE || (*itB).first + offset1 > x2)
break;
do {x2 = _MAX(x2, (*itB++).second + offset2);}
while(itB != itBE && (*itB).first + offset1 <= x2);
// If we run out of A spans or the A span doesn't overlap,
// then the next B span can't either (because B spans don't
// overlap) and we exit.
if(itA == itAE || (*itA).first > x2)
break;
do {x2 = _MAX(x2, (*itA++).second);}
while(itA != itAE && (*itA).first <= x2);
}
// Flush span.
dst.push_back(tSpan(x1, x2));
}
}
// Copy over leftover spans.
// At most one of the two lists still has entries; A spans are copied as
// they are, B spans get the same shift/extension as above.
while(itA != itAE)
dst.push_back(*itA++);
while(itB != itBE)
{
dst.push_back(tSpan((*itB).first + offset1, (*itB).second + offset2));
++itB;
}
}
// Builds mWideOutline by merging copies of mOutline into it, one for every
// vertical offset in [-r, r], each widened horizontally by the matching
// half-width of a circle of radius r. Negative radii are treated as 0.
// Records the radius in mWideBorder and always returns true.
bool Rasterizer::CreateWidenedRegion(int r)
{
	const int radius = (r < 0) ? 0 : r;
	// _OverlapRegion() extends spans to both the left and the right, so one
	// half-width per row is enough to cover the full disc.
	int y = -radius;
	while(y <= radius)
	{
		const int x = (int)(0.5 + sqrt(float(radius*radius - y*y)));
		_OverlapRegion(mWideOutline, mOutline, x, y);
		++y;
	}
	mWideBorder = radius;
	return true;
}
// Empties both span lists: the plain outline and the widened outline.
void Rasterizer::DeleteOutlines()
{
	mOutline.clear();
	mWideOutline.clear();
}
// Builds the overlay bitmap (mpOverlayBuffer) from the span lists produced by
// ScanConvert() and, if present, CreateWidenedRegion().
//
// xsub, ysub - offsets added to the span coordinates; only their low three
//              bits are used.
// fBlur      - when true and the overlay is at least 3x3 cells, a 3x3
//              weighted blur is applied to one of the two channels.
//
// The overlay stores two interleaved bytes per cell: byte 0 accumulates
// coverage from mOutline, byte 1 from mWideOutline. Span coordinates are
// divided by 8 (>>3) to find the cell, and each span adds the number of
// coordinate units it covers within that cell.
//
// Returns true, except for the (tmp == NULL) check in the blur path.
bool Rasterizer::Rasterize(int xsub, int ysub, bool fBlur)
{
_TrashOverlay();
// Nothing to draw: report an empty overlay.
if(!mWidth || !mHeight)
{
mOverlayWidth = mOverlayHeight = 0;
return true;
}
xsub &= 7;
ysub &= 7;
int width = mWidth + xsub;
int height = mHeight + ysub;
mOffsetX = mPathOffsetX - xsub;
mOffsetY = mPathOffsetY - ysub;
// Round the border up to a multiple of 8. Note that this writes the
// rounded value back into the member.
mWideBorder = (mWideBorder+7)&~7;
// With a widened outline, make room for the border on every side and
// shift the drawing origin accordingly.
if(!mWideOutline.empty())
{
width += 2*mWideBorder;
height += 2*mWideBorder;
xsub += mWideBorder;
ysub += mWideBorder;
mOffsetX -= mWideBorder;
mOffsetY -= mWideBorder;
}
// Overlay size in cells: extent divided by 8, rounded up, plus one.
mOverlayWidth = ((width+7)>>3) + 1;
mOverlayHeight = ((height+7)>>3) + 1;
mpOverlayBuffer = new byte[2 * mOverlayWidth * mOverlayHeight];
memset(mpOverlayBuffer, 0, 2 * mOverlayWidth * mOverlayHeight);
// Are we doing a border?
// Index 0 (plain outline) is written to byte 0 of each cell, index 1
// (widened outline) to byte 1; the widened outline is processed first.
tSpanBuffer* pOutline[2] = {&mOutline, &mWideOutline};
for(int i = countof(pOutline)-1; i >= 0; i--)
{
tSpanBuffer::iterator it = pOutline[i]->begin();
tSpanBuffer::iterator itEnd = pOutline[i]->end();
for(; it!=itEnd; ++it)
{
// Decode the span key: y in the upper 32 bits, x in the lower 32 bits,
// both biased by 0x40000000.
int y = (int)(((*it).first >> 32) - 0x40000000 + ysub);
int x1 = (int)(((*it).first & 0xffffffff) - 0x40000000 + xsub);
int x2 = (int)(((*it).second & 0xffffffff) - 0x40000000 + xsub);
if(x2 > x1)
{
int first = x1>>3;
int last = (x2-1)>>3;
byte* dst = mpOverlayBuffer + 2*(mOverlayWidth*(y>>3) + first) + i;
// Span lies within a single cell: add its full length.
if(first == last)
*dst += x2-x1;
else
{
// Partial coverage of the first cell, 8 for every cell in between,
// partial coverage of the last cell.
*dst += ((first+1)<<3) - x1;
dst += 2;
while(++first < last)
{
*dst += 0x08;
dst += 2;
}
*dst += x2 - (last<<3);
}
}
}
}
// If we're blurring, do a 3x3 blur with weights 1-2-1 / 2-4-2 / 1-2-1
// (sum 16, hence the >>4).
// Can't do it on subpictures smaller than 3x3 pixels
if(fBlur && mOverlayWidth >= 3 && mOverlayHeight >= 3)
{
int pitch = mOverlayWidth*2;
// Blur reads from an unmodified copy and writes into the overlay.
byte* tmp = new byte[pitch*mOverlayHeight];
if(!tmp) return(false);
memcpy(tmp, mpOverlayBuffer, pitch*mOverlayHeight);
// Selects the channel to blur: byte 1 (widened outline) when a widened
// outline exists, otherwise byte 0.
int border = !mWideOutline.empty() ? 1 : 0;
// This could be done in a separated way and win some speed
// The outermost row and column on each side are left unblurred.
for(int j = 1; j < mOverlayHeight-1; j++)
{
byte* src = tmp + pitch*j + 2 + border;
byte* dst = mpOverlayBuffer + pitch*j + 2 + border;
for(int i = 1; i < mOverlayWidth-1; i++, src+=2, dst+=2)
{
*dst = (src[-2-pitch] + (src[-pitch]<<1) + src[+2-pitch]
+ (src[-2]<<1) + (src[0]<<2) + (src[+2]<<1)
+ src[-2+pitch] + (src[+pitch]<<1) + src[+2+pitch]) >> 4;
}
}
delete [] tmp;
}
return true;
}
///////////////////////////////////////////////////////////////////////////
// Blends `color` over the pixel at *dst.
//
// The blend weight is alpha * (top byte of color) >> 6, limited to 0..255.
// The three lower bytes of the pixel are mixed between destination and
// colour; the top byte of the destination is only scaled by the destination
// weight.
static __forceinline void pixmix(DWORD *dst, DWORD color, DWORD alpha)
{
	int srcWeight = (((alpha)*(color>>24))>>6)&0xff;
	// Both weights must end up in 1..256 so that the >>8 below divides correctly.
	const int dstWeight = 256-srcWeight;
	srcWeight += 1;
	const DWORD under = *dst;
	// Bytes 0 and 2 are processed together, byte 1 on its own, so the
	// products cannot spill into a neighbouring byte that is kept.
	const DWORD bytes0and2 = (((under&0x00ff00ff)*dstWeight + (color&0x00ff00ff)*srcWeight)&0xff00ff00)>>8;
	const DWORD byte1 = (((under&0x0000ff00)*dstWeight + (color&0x0000ff00)*srcWeight)&0x00ff0000)>>8;
	const DWORD byte3 = (((under>>8)&0x00ff0000)*dstWeight)&0xff000000;
	*dst = bytes0and2 | byte1 | byte3;
}
// Same blend as pixmix(), but the weight is the product of two coverage
// values: shapealpha * clipalpha * (top byte of color) >> 12, limited to
// 0..255.
static __forceinline void pixmix2(DWORD *dst, DWORD color, DWORD shapealpha, DWORD clipalpha)
{
	int srcWeight = (((shapealpha)*(clipalpha)*(color>>24))>>12)&0xff;
	// Keep both weights in 1..256 for the >>8 normalisation below.
	const int dstWeight = 256-srcWeight;
	srcWeight += 1;
	const DWORD under = *dst;
	const DWORD bytes0and2 = (((under&0x00ff00ff)*dstWeight + (color&0x00ff00ff)*srcWeight)&0xff00ff00)>>8;
	const DWORD byte1 = (((under&0x0000ff00)*dstWeight + (color&0x0000ff00)*srcWeight)&0x00ff0000)>>8;
	const DWORD byte3 = (((under>>8)&0x00ff0000)*dstWeight)&0xff000000;
	*dst = bytes0and2 | byte1 | byte3;
}
#include <xmmintrin.h>
#include <emmintrin.h>
// SSE2 version of the single-coverage blend.
//
// The blend weight is alpha * (top byte of color) >> 6, limited to 0..255.
// Destination and colour bytes are widened to 16 bits and interleaved so a
// single multiply-add computes dst*(256-w) + src*(w+1) per byte; the result
// is divided by 256 and packed back into one 32-bit pixel. The colour's top
// byte is cleared before blending.
static __forceinline void pixmix_sse2(DWORD* dst, DWORD color, DWORD alpha)
{
	alpha = (((alpha) * (color>>24)) >> 6) & 0xff;
	color &= 0xffffff;
	const __m128i zeros = _mm_setzero_si128();
	// Each 32-bit lane: high word = colour weight, low word = destination weight.
	const __m128i weights = _mm_set1_epi32(((alpha+1) << 16) | (0x100 - alpha));
	const __m128i dstWords = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*dst), zeros);
	const __m128i srcWords = _mm_unpacklo_epi8(_mm_cvtsi32_si128(color), zeros);
	__m128i mixed = _mm_madd_epi16(_mm_unpacklo_epi16(dstWords, srcWords), weights);
	mixed = _mm_srli_epi32(mixed, 8);
	mixed = _mm_packs_epi32(mixed, mixed);
	mixed = _mm_packus_epi16(mixed, mixed);
	*dst = (DWORD)_mm_cvtsi128_si32(mixed);
}
// SSE2 version of the double-coverage blend: identical to pixmix_sse2()
// except that the weight is shapealpha * clipalpha * (top byte of color)
// >> 12, limited to 0..255.
static __forceinline void pixmix2_sse2(DWORD* dst, DWORD color, DWORD shapealpha, DWORD clipalpha)
{
	int alpha = (((shapealpha)*(clipalpha)*(color>>24))>>12)&0xff;
	color &= 0xffffff;
	const __m128i zeros = _mm_setzero_si128();
	// Each 32-bit lane: high word = colour weight, low word = destination weight.
	const __m128i weights = _mm_set1_epi32(((alpha+1) << 16) | (0x100 - alpha));
	const __m128i dstWords = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*dst), zeros);
	const __m128i srcWords = _mm_unpacklo_epi8(_mm_cvtsi32_si128(color), zeros);
	__m128i mixed = _mm_madd_epi16(_mm_unpacklo_epi16(dstWords, srcWords), weights);
	mixed = _mm_srli_epi32(mixed, 8);
	mixed = _mm_packs_epi32(mixed, mixed);
	mixed = _mm_packus_epi16(mixed, mixed);
	*dst = (DWORD)_mm_cvtsi128_si32(mixed);
}
// For CPUID usage in Rasterizer::Draw
#include "../dsutil/vd.h"
// 64-bit constant holding 0x00ff in each of its four 16-bit words.
// It is not referenced in the part of this file shown above.
static const __int64 _00ff00ff00ff00ff = 0x00ff00ff00ff00ffi64;
// Render the overlay built by Rasterize() onto a surface.
//
// spd        - the surface to render on (w, h, pitch and bits are used).
// clipRect   - rectangular clip region; drawing is limited to its
//              intersection with the surface.
// pAlphaMask - optional per-pixel clipping mask, indexed with a row stride of
//              spd.w; NULL means no mask.
// xsub, ysub - position offsets, added to mOffsetX / mOffsetY and then
//              rounded from 1/8 units to whole pixels.
// switchpts  - array of (colour, x position) pairs: switchpts[i*2] is a
//              colour and switchpts[i*2+1] the overlay x position from which
//              it applies. A position of 0xffffffff in switchpts[1] means a
//              single colour is used throughout.
// fBody      - selects the "body" code path (see the comments at `s` below).
// fBorder    - when true, the body path reads the widened channel.
//
// Returns the rectangle that was drawn to (clipped to the surface), or an
// empty rectangle when nothing was drawn.
CRect Rasterizer::Draw(SubPicDesc& spd, CRect& clipRect, byte* pAlphaMask, int xsub, int ysub, const long* switchpts, bool fBody, bool fBorder)
{
CRect bbox(0, 0, 0, 0);
// Nothing to do without colours, or when neither body nor border is wanted.
if(!switchpts || !fBody && !fBorder) return(bbox);
// clip
// Limit drawn area to intersection of rendering surface and rectangular clip area
CRect r(0, 0, spd.w, spd.h);
r &= clipRect;
// Remember that all subtitle coordinates are specified in 1/8 pixels
// (x+4)>>3 rounds to nearest whole pixel.
// mOffsetX / mOffsetY are set by Rasterize() from the path offset.
int x = (xsub + mOffsetX + 4)>>3;
int y = (ysub + mOffsetY + 4)>>3;
int w = mOverlayWidth;
int h = mOverlayHeight;
// xo, yo: number of overlay columns / rows skipped by clipping on the
// left / top.
int xo = 0, yo = 0;
// Clip the overlay rectangle against r on all four sides.
if(x < r.left) {xo = r.left-x; w -= r.left-x; x = r.left;}
if(y < r.top) {yo = r.top-y; h -= r.top-y; y = r.top;}
if(x+w > r.right) w = r.right-x;
if(y+h > r.bottom) h = r.bottom-y;
// Check if there's actually anything to render
if(w <= 0 || h <= 0) return(bbox);
bbox.SetRect(x, y, x+w, y+h);
bbox &= CRect(0, 0, spd.w, spd.h);
// draw
// First visible cell of the overlay (two bytes per cell: byte 0 is the
// plain outline channel, byte 1 the widened outline channel).
const byte* src = mpOverlayBuffer + 2*(mOverlayWidth * yo + xo);
// s points to what the "body" to use is
// If we're rendering body fill and border, src+1 points to the array of
// widened regions which contain both border and fill in one.
const byte* s = fBorder ? (src+1) : src;
// Matching position in the alpha mask. It is only dereferenced when
// pAlphaMask is non-NULL.
const byte* am = pAlphaMask + spd.w * y + x;
// Matching position in the destination surface (32-bit pixels, rows
// spd.pitch bytes apart).
unsigned long* dst = (unsigned long *)((char *)spd.bits + spd.pitch * y) + x;
// Grab the first colour
// NOTE(review): color is not reset at the start of each row while sw is,
// so a row starts with the colour the previous row ended with unless
// xo >= switchpts[1]. Confirm callers always pass switchpts[1] <= 0.
unsigned long color = switchpts[0];
// CPUID from VDub
bool fSSE2 = !!(g_cpuid.m_flags & CCpuID::sse2);
// Every remaining line in the bitmap to be rendered...
while(h--)
{
// Basic case of no complex clipping mask
if(!pAlphaMask)
{
// If the first colour switching coordinate is at "infinite" we're
// never switching and can use some simpler code.
// ??? Is this optimisation really worth the extra readability issues it adds?
if(switchpts[1] == 0xffffffff)
{
// fBody is true if we're rendering a fill or a shadow.
if(fBody)
{
// Run over every pixel, overlaying the subtitles with the fill colour
if(fSSE2)
for(int wt=0; wt<w; ++wt)
// pixmix scales its alpha argument by >>6, so the single
// coverage byte from the overlay is passed as it is.
// (No alpha mask.)
pixmix_sse2(&dst[wt], color, s[wt*2]);
else
for(int wt=0; wt<w; ++wt)
pixmix(&dst[wt], color, s[wt*2]);
}
// Not body: draw only the difference between the two channels.
else
{
if(fSSE2)
for(int wt=0; wt<w; ++wt)
// src holds two bitmaps interleaved per cell; the value used
// here is widened channel minus plain channel.
// Presumably the widened region from CreateWidenedRegion
// always contains the plain one, so the difference does
// not underflow. NOTE(review): not enforced here.
pixmix_sse2(&dst[wt], color, src[wt*2+1] - src[wt*2]);
else
for(int wt=0; wt<w; ++wt)
pixmix(&dst[wt], color, src[wt*2+1] - src[wt*2]);
}
}
// not (switchpts[1] == 0xffffffff)
else
{
// Colour switching: sw walks the (colour, position) pairs for this row.
const long *sw = switchpts;
if(fBody)
{
if(fSSE2)
for(int wt=0; wt<w; ++wt)
{
// wt+xo is the x position within the unclipped overlay.
// Once it reaches the next switch position, advance past every
// pair whose position has been reached and take the colour of
// the last one passed.
if(wt+xo >= sw[1]) {while(wt+xo >= sw[1]) sw += 2; color = sw[-2];}
pixmix_sse2(&dst[wt], color, s[wt*2]);
}
else
for(int wt=0; wt<w; ++wt)
{
if(wt+xo >= sw[1]) {while(wt+xo >= sw[1]) sw += 2; color = sw[-2];}
pixmix(&dst[wt], color, s[wt*2]);
}
}
// Not body
else
{
if(fSSE2)
for(int wt=0; wt<w; ++wt)
{
if(wt+xo >= sw[1]) {while(wt+xo >= sw[1]) sw += 2; color = sw[-2];}
pixmix_sse2(&dst[wt], color, src[wt*2+1] - src[wt*2]);
}
else
for(int wt=0; wt<w; ++wt)
{
if(wt+xo >= sw[1]) {while(wt+xo >= sw[1]) sw += 2; color = sw[-2];}
pixmix(&dst[wt], color, src[wt*2+1] - src[wt*2]);
}
}
}
}
// Here we *do* have an alpha mask
else
{
if(switchpts[1] == 0xffffffff)
{
if(fBody)
{
if(fSSE2)
for(int wt=0; wt<w; ++wt)
// s is the subtitle shape coverage and am the clipping
// mask; pixmix2 multiplies the two and scales by >>12.
pixmix2_sse2(&dst[wt], color, s[wt*2], am[wt]);
else
for(int wt=0; wt<w; ++wt)
pixmix2(&dst[wt], color, s[wt*2], am[wt]);
}
else
{
if(fSSE2)
for(int wt=0; wt<w; ++wt)
pixmix2_sse2(&dst[wt], color, src[wt*2+1] - src[wt*2], am[wt]);
else
for(int wt=0; wt<w; ++wt)
pixmix2(&dst[wt], color, src[wt*2+1] - src[wt*2], am[wt]);
}
}
else
{
const long *sw = switchpts;
if(fBody)
{
if(fSSE2)
for(int wt=0; wt<w; ++wt)
{
// Note the layout below: only `sw += 2;` is the body of the while
// loop; `color = sw[-2];` runs once after the loop has finished,
// exactly as in the one-line form used in the no-mask branch.
if(wt+xo >= sw[1]) {
while(wt+xo >= sw[1])
sw += 2; color = sw[-2];
}
pixmix2_sse2(&dst[wt], color, s[wt*2], am[wt]);
}
else
for(int wt=0; wt<w; ++wt)
{
if(wt+xo >= sw[1]) {
while(wt+xo >= sw[1])
sw += 2; color = sw[-2];
}
pixmix2(&dst[wt], color, s[wt*2], am[wt]);
}
}
else
{
if(fSSE2)
for(int wt=0; wt<w; ++wt)
{
if(wt+xo >= sw[1]) {
while(wt+xo >= sw[1])
sw += 2; color = sw[-2];
}
pixmix2_sse2(&dst[wt], color, src[wt*2+1] - src[wt*2], am[wt]);
}
else
for(int wt=0; wt<w; ++wt)
{
if(wt+xo >= sw[1]) {
while(wt+xo >= sw[1])
sw += 2; color = sw[-2];
}
pixmix2(&dst[wt], color, src[wt*2+1] - src[wt*2], am[wt]);
}
}
}
}
// Step to next scanline
src += 2*mOverlayWidth;
s += 2*mOverlayWidth;
am += spd.w;
dst = (unsigned long *)((char *)dst + spd.pitch);
}
return bbox;
}