Fixed support for RTF documents using ANSI charset and added support
for multibyte charsets, so that BIG5 and UTF-8 RTF documents are
working now.
This commit is contained in:
Phil Krylov 2005-03-18 10:24:51 +00:00 committed by Alexandre Julliard
parent dbf222f3aa
commit 297cd99ca3
3 changed files with 204 additions and 134 deletions

View File

@ -396,23 +396,24 @@ void ME_RTFReadHook(RTF_Info *info) {
switch(info->rtfMajor) switch(info->rtfMajor)
{ {
case rtfBeginGroup: case rtfBeginGroup:
if (info->formatStackTop < maxCharFormatStack) { if (info->stackTop < maxStack) {
info->formatStack[info->formatStackTop].cbSize = sizeof(info->formatStack[0]); memcpy(&info->stack[info->stackTop].fmt, &info->style->fmt, sizeof(CHARFORMAT2W));
memcpy(&info->formatStack[info->formatStackTop], &info->style->fmt, sizeof(CHARFORMAT2W)); info->stack[info->stackTop].codePage = info->codePage;
info->codePageStack[info->formatStackTop] = info->codePage; info->stack[info->stackTop].unicodeLength = info->unicodeLength;
} }
info->formatStackTop++; info->stackTop++;
break; break;
case rtfEndGroup: case rtfEndGroup:
{ {
ME_Style *s; ME_Style *s;
RTFFlushOutputBuffer(info); RTFFlushOutputBuffer(info);
info->formatStackTop--; info->stackTop--;
/* FIXME too slow ? how come ? */ /* FIXME too slow ? how come ? */
s = ME_ApplyStyle(info->style, &info->formatStack[info->formatStackTop]); s = ME_ApplyStyle(info->style, &info->stack[info->stackTop].fmt);
ME_ReleaseStyle(info->style); ME_ReleaseStyle(info->style);
info->style = s; info->style = s;
info->codePage = info->codePageStack[info->formatStackTop]; info->codePage = info->stack[info->stackTop].codePage;
info->unicodeLength = info->stack[info->stackTop].unicodeLength;
break; break;
} }
} }
@ -476,6 +477,7 @@ static LRESULT ME_StreamIn(ME_TextEditor *editor, DWORD format, EDITSTREAM *stre
/* do the parsing */ /* do the parsing */
RTFRead(&parser); RTFRead(&parser);
RTFFlushOutputBuffer(&parser); RTFFlushOutputBuffer(&parser);
RTFDestroy(&parser);
style = parser.style; style = parser.style;
} }

View File

@ -67,12 +67,13 @@ static void LookupInit (void);
static void Lookup (RTF_Info *, char *); static void Lookup (RTF_Info *, char *);
static int Hash (char*); static int Hash (char*);
static void RTFOutputUnicodeString( RTF_Info *info, WCHAR *str, int len );
static void CharAttr(RTF_Info *info); static void CharAttr(RTF_Info *info);
static void CharSet(RTF_Info *info); static void CharSet(RTF_Info *info);
static void DocAttr(RTF_Info *info); static void DocAttr(RTF_Info *info);
static void RTFFlushCPOutputBuffer(RTF_Info *info);
static void RTFPutCodePageChar(RTF_Info *info, int c);
int _RTFGetChar(RTF_Info *info) int _RTFGetChar(RTF_Info *info)
{ {
@ -113,6 +114,58 @@ void RTFSetEditStream(RTF_Info *info, EDITSTREAM *es)
info->editstream.pfnCallback = es->pfnCallback; info->editstream.pfnCallback = es->pfnCallback;
} }
/*
 * Free the font, color and style lists hanging off the reader.
 * Each list is walked from its head; every node is detached before
 * it is released, so all three list heads are NULL on return.
 */
static void
RTFDestroyAttrs(RTF_Info *info)
{
    /* fonts: each node owns its name string */
    while (info->fontList)
    {
        RTFFont *font = info->fontList;

        info->fontList = font->rtfNextFont;
        RTFFree (font->rtfFName);
        RTFFree ((char *) font);
    }

    /* colors: nodes own nothing besides themselves */
    while (info->colorList)
    {
        RTFColor *color = info->colorList;

        info->colorList = color->rtfNextColor;
        RTFFree ((char *) color);
    }

    /* styles: each node owns a name and a list of style elements */
    while (info->styleList)
    {
        RTFStyle *style = info->styleList;
        RTFStyleElt *elt = style->rtfSSEList;

        info->styleList = style->rtfNextStyle;
        while (elt)
        {
            RTFStyleElt *nextElt = elt->rtfNextSE;

            RTFFree (elt->rtfSEText);
            RTFFree ((char *) elt);
            elt = nextElt;
        }
        RTFFree (style->rtfSName);
        RTFFree ((char *) style);
    }
}
/*
 * Release everything the reader allocated: the token text buffers,
 * the font/color/style lists and the code page output buffer.
 *
 * Every freed pointer is reset to NULL.  RTFInit() only allocates
 * cpOutputBuffer when the field is NULL, so leaving a stale pointer
 * behind would make a later RTFInit() reuse freed memory.
 */
void
RTFDestroy(RTF_Info *info)
{
    if (info->rtfTextBuf)
    {
        RTFFree(info->rtfTextBuf);
        RTFFree(info->pushedTextBuf);
        info->rtfTextBuf = NULL;
        info->pushedTextBuf = NULL;
    }
    RTFDestroyAttrs(info);
    RTFFree(info->cpOutputBuffer);
    info->cpOutputBuffer = NULL;
    /* nothing can be pending once the buffer is gone */
    info->dwCPOutputCount = 0;
}
/* /*
* Initialize the reader. This may be called multiple times, * Initialize the reader. This may be called multiple times,
* to read multiple files. The only thing not reset is the input * to read multiple files. The only thing not reset is the input
@ -122,10 +175,6 @@ void RTFSetEditStream(RTF_Info *info, EDITSTREAM *es)
void RTFInit(RTF_Info *info) void RTFInit(RTF_Info *info)
{ {
int i; int i;
RTFColor *cp;
RTFFont *fp;
RTFStyle *sp;
RTFStyleElt *eltList, *ep;
TRACE("\n"); TRACE("\n");
@ -164,36 +213,10 @@ void RTFInit(RTF_Info *info)
/* dump old lists if necessary */ /* dump old lists if necessary */
while (info->fontList != (RTFFont *) NULL) RTFDestroyAttrs(info);
{
fp = info->fontList->rtfNextFont;
RTFFree (info->fontList->rtfFName);
RTFFree ((char *) info->fontList);
info->fontList = fp;
}
while (info->colorList != (RTFColor *) NULL)
{
cp = info->colorList->rtfNextColor;
RTFFree ((char *) info->colorList);
info->colorList = cp;
}
while (info->styleList != (RTFStyle *) NULL)
{
sp = info->styleList->rtfNextStyle;
eltList = info->styleList->rtfSSEList;
while (eltList != (RTFStyleElt *) NULL)
{
ep = eltList->rtfNextSE;
RTFFree (eltList->rtfSEText);
RTFFree ((char *) eltList);
eltList = ep;
}
RTFFree (info->styleList->rtfSName);
RTFFree ((char *) info->styleList);
info->styleList = sp;
}
info->ansiCodePage = 1252; /* Latin-1 */ info->ansiCodePage = 1252; /* Latin-1 */
info->unicodeLength = 1; /* \uc1 is the default */ info->unicodeLength = 1; /* \uc1 is the default */
info->codePage = info->ansiCodePage; info->codePage = info->ansiCodePage;
@ -205,6 +228,13 @@ void RTFInit(RTF_Info *info)
info->rtfLinePos = 0; info->rtfLinePos = 0;
info->prevChar = EOF; info->prevChar = EOF;
info->bumpLine = 0; info->bumpLine = 0;
info->dwCPOutputCount = 0;
if (!info->cpOutputBuffer)
{
info->dwMaxCPOutputCount = 0x1000;
info->cpOutputBuffer = RTFAlloc(info->dwMaxCPOutputCount);
}
} }
/* /*
@ -475,17 +505,6 @@ static void _RTFGetToken(RTF_Info *info)
} }
static WCHAR
RTFANSIToUnicode(RTF_Info *info, char c)
{
WCHAR buffer[2] = { 0, 0 };
/* TODO: Probably caching codepage conversion tables would be faster... */
MultiByteToWideChar(info->codePage, 0, &c, 1, buffer, 2);
return buffer[0];
}
static int static int
RTFCharSetToCodePage(RTF_Info *info, int charset) RTFCharSetToCodePage(RTF_Info *info, int charset)
{ {
@ -493,7 +512,7 @@ RTFCharSetToCodePage(RTF_Info *info, int charset)
{ {
case ANSI_CHARSET: case ANSI_CHARSET:
case DEFAULT_CHARSET: case DEFAULT_CHARSET:
return 0; return info->ansiCodePage;
case SYMBOL_CHARSET: case SYMBOL_CHARSET:
return CP_SYMBOL; return CP_SYMBOL;
case MAC_CHARSET: case MAC_CHARSET:
@ -603,10 +622,6 @@ static void _RTFGetToken2(RTF_Info *info)
else else
{ {
info->rtfClass = rtfText; info->rtfClass = rtfText;
if (c & 0x80)
info->rtfMajor = RTFANSIToUnicode(info, c);
else
info->rtfMajor = c; info->rtfMajor = c;
} }
return; return;
@ -632,7 +647,7 @@ static void _RTFGetToken2(RTF_Info *info)
{ {
/* should do isxdigit check! */ /* should do isxdigit check! */
info->rtfClass = rtfText; info->rtfClass = rtfText;
info->rtfMajor = RTFANSIToUnicode(info, RTFCharToHex (c) * 16 + RTFCharToHex (c2)); info->rtfMajor = RTFCharToHex (c) * 16 + RTFCharToHex (c2);
return; return;
} }
/* early eof, whoops (class is rtfUnknown) */ /* early eof, whoops (class is rtfUnknown) */
@ -1416,6 +1431,7 @@ static RTFKey rtfKey[] =
{ rtfCharAttr, rtfLanguage, "lang", 0 }, { rtfCharAttr, rtfLanguage, "lang", 0 },
/* this has disappeared from spec 1.2 */ /* this has disappeared from spec 1.2 */
{ rtfCharAttr, rtfGray, "gray", 0 }, { rtfCharAttr, rtfGray, "gray", 0 },
{ rtfCharAttr, rtfUnicodeLength, "uc", 0 },
/* /*
* Paragraph formatting attributes * Paragraph formatting attributes
@ -1704,9 +1720,9 @@ static RTFKey rtfKey[] =
{ rtfDocAttr, rtfRTLDoc, "rtldoc", 0 }, { rtfDocAttr, rtfRTLDoc, "rtldoc", 0 },
{ rtfDocAttr, rtfLTRDoc, "ltrdoc", 0 }, { rtfDocAttr, rtfLTRDoc, "ltrdoc", 0 },
{ rtfDocAttr, rtfAnsiCodePage, "ansicpg", 0 }, { rtfDocAttr, rtfAnsiCodePage, "ansicpg", 0 },
{ rtfDocAttr, rtfUnicodeLength, "uc", 0 }, { rtfDocAttr, rtfUTF8RTF, "urtf", 0 },
/* /*
* Style attributes * Style attributes
@ -2475,7 +2491,7 @@ static void TextClass (RTF_Info *info);
static void ControlClass (RTF_Info *info); static void ControlClass (RTF_Info *info);
static void Destination (RTF_Info *info); static void Destination (RTF_Info *info);
static void SpecialChar (RTF_Info *info); static void SpecialChar (RTF_Info *info);
static void PutLitChar (RTF_Info *info, int c); static void RTFPutUnicodeChar (RTF_Info *info, int c);
/* /*
* Initialize the writer. * Initialize the writer.
@ -2499,14 +2515,13 @@ BeginFile (RTF_Info *info )
} }
/* /*
* Write out a character. Seems to work for the default ANSI codepage, * Write out a character.
* contrary to TextClass_orig.
*/ */
static void static void
TextClass (RTF_Info *info) TextClass (RTF_Info *info)
{ {
PutLitChar (info, info->rtfMajor); RTFPutCodePageChar(info, info->rtfMajor);
} }
@ -2530,7 +2545,7 @@ ControlClass (RTF_Info *info)
DocAttr(info); DocAttr(info);
break; break;
case rtfSpecialChar: case rtfSpecialChar:
SpecialChar (info); SpecialChar (info);
break; break;
} }
} }
@ -2539,10 +2554,19 @@ ControlClass (RTF_Info *info)
static void static void
CharAttr(RTF_Info *info) CharAttr(RTF_Info *info)
{ {
RTFFont *font;
switch (info->rtfMinor) switch (info->rtfMinor)
{ {
case rtfFontNum: case rtfFontNum:
info->codePage = RTFGetFont(info, info->rtfParam)->rtfFCodePage; font = RTFGetFont(info, info->rtfParam);
if (font)
info->codePage = font->rtfFCodePage;
else
RTFMsg(info, "unknown font %d\n", info->rtfParam);
break;
case rtfUnicodeLength:
info->unicodeLength = info->rtfParam;
break; break;
} }
} }
@ -2591,8 +2615,8 @@ DocAttr(RTF_Info *info)
case rtfAnsiCodePage: case rtfAnsiCodePage:
info->ansiCodePage = info->rtfParam; info->ansiCodePage = info->rtfParam;
break; break;
case rtfUnicodeLength: case rtfUTF8RTF:
info->unicodeLength = info->rtfParam; info->ansiCodePage = CP_UTF8;
break; break;
} }
} }
@ -2616,23 +2640,20 @@ static void SpecialChar (RTF_Info *info)
break; break;
case rtfUnicode: case rtfUnicode:
{ {
WCHAR buf[2];
int i; int i;
buf[0] = info->rtfParam; RTFPutUnicodeChar(info, info->rtfParam);
buf[1] = 0;
RTFFlushOutputBuffer(info);
RTFOutputUnicodeString(info, buf, 1);
/* After \u we must skip number of character tokens set by \ucN */ /* After \u we must skip number of character tokens set by \ucN */
for (i = 0; i < info->unicodeLength; i++) for (i = 0; i < info->unicodeLength; i++)
{ {
RTFGetToken(info); RTFGetToken(info);
if (info->rtfClass != rtfText) if (info->rtfClass != rtfText)
{ {
ERR("The token behind \\u is not text, but (%d,%d,%d)\n", ERR("The token behind \\u is not text, but (%d,%d,%d)\n",
info->rtfClass, info->rtfMajor, info->rtfMinor); info->rtfClass, info->rtfMajor, info->rtfMinor);
RTFUngetToken(info); RTFUngetToken(info);
break;
} }
} }
break; break;
@ -2642,64 +2663,117 @@ static void SpecialChar (RTF_Info *info)
case rtfRow: case rtfRow:
case rtfLine: case rtfLine:
case rtfPar: case rtfPar:
PutLitChar (info, '\n'); RTFPutUnicodeChar (info, '\n');
break; break;
case rtfCell: case rtfCell:
PutLitChar (info, ' '); /* make sure cells are separated */ RTFPutUnicodeChar (info, ' '); /* make sure cells are separated */
break; break;
case rtfNoBrkSpace: case rtfNoBrkSpace:
PutLitChar (info, 0x00A0); RTFPutUnicodeChar (info, 0x00A0);
break; break;
case rtfTab: case rtfTab:
PutLitChar (info, '\t'); RTFPutUnicodeChar (info, '\t');
break; break;
case rtfNoBrkHyphen: case rtfNoBrkHyphen:
PutLitChar (info, 0x2011); RTFPutUnicodeChar (info, 0x2011);
break; break;
case rtfBullet: case rtfBullet:
PutLitChar (info, 0x2022); RTFPutUnicodeChar (info, 0x2022);
break; break;
case rtfEmDash: case rtfEmDash:
PutLitChar (info, 0x2014); RTFPutUnicodeChar (info, 0x2014);
break; break;
case rtfEnDash: case rtfEnDash:
PutLitChar (info, 0x2013); RTFPutUnicodeChar (info, 0x2013);
break; break;
case rtfLQuote: case rtfLQuote:
PutLitChar (info, 0x2018); RTFPutUnicodeChar (info, 0x2018);
break; break;
case rtfRQuote: case rtfRQuote:
PutLitChar (info, 0x2019); RTFPutUnicodeChar (info, 0x2019);
break; break;
case rtfLDblQuote: case rtfLDblQuote:
PutLitChar (info, 0x201C); RTFPutUnicodeChar (info, 0x201C);
break; break;
case rtfRDblQuote: case rtfRDblQuote:
PutLitChar (info, 0x201D); RTFPutUnicodeChar (info, 0x201D);
break; break;
} }
} }
static void PutLitChar (RTF_Info *info, int c) static void
RTFFlushUnicodeOutputBuffer(RTF_Info *info)
{ {
if( info->dwOutputCount >= ( sizeof info->OutputBuffer - 1 ) ) if (info->dwOutputCount)
RTFFlushOutputBuffer( info ); {
ME_InsertTextFromCursor(info->editor, 0, info->OutputBuffer,
info->dwOutputCount, info->style);
info->dwOutputCount = 0;
}
}
/*
 * Append `length` WCHARs from `string` to the Unicode output buffer,
 * flushing the buffer to the editor each time it becomes full.
 *
 * Pending code page bytes are converted and emitted first so that the
 * output keeps the order in which the text was read.
 */
static void
RTFPutUnicodeString(RTF_Info *info, WCHAR *string, int length)
{
    const int capacity = sizeof(info->OutputBuffer) / sizeof(WCHAR);

    if (info->dwCPOutputCount)
        RTFFlushCPOutputBuffer(info);

    while (length > 0)
    {
        int room = capacity - (int)info->dwOutputCount;
        int fit = min(length, room);

        memmove(info->OutputBuffer + info->dwOutputCount, string, fit * sizeof(WCHAR));
        /* Account for the copied chars BEFORE flushing; otherwise a chunk
         * that exactly fills the buffer would be dropped by the flush. */
        info->dwOutputCount += fit;
        if ((int)info->dwOutputCount == capacity)
            RTFFlushUnicodeOutputBuffer(info);
        length -= fit;
        string += fit;
    }
}
/*
 * Convert the bytes collected in cpOutputBuffer from the current code
 * page to Unicode and hand the result to RTFPutUnicodeString().
 *
 * dwCPOutputCount is cleared before RTFPutUnicodeString() is called,
 * because that function flushes the code page buffer itself whenever
 * the count is non-zero.
 */
static void
RTFFlushCPOutputBuffer(RTF_Info *info)
{
    /* size of the temporary buffer in BYTES */
    int bufferMax = info->dwCPOutputCount * 2 * sizeof(WCHAR);
    WCHAR *buffer = (WCHAR *)RTFAlloc(bufferMax);
    int length;

    /* MultiByteToWideChar expects the destination size in WCHARs, not bytes */
    length = MultiByteToWideChar(info->codePage, 0, info->cpOutputBuffer,
                                 info->dwCPOutputCount, buffer,
                                 bufferMax / sizeof(WCHAR));
    info->dwCPOutputCount = 0;

    RTFPutUnicodeString(info, buffer, length);
    RTFFree((char *)buffer);
}
/*
 * Flush all buffered output: first any pending code page bytes,
 * then the Unicode output buffer.
 */
void
RTFFlushOutputBuffer(RTF_Info *info)
{
    if (info->dwCPOutputCount != 0)
    {
        RTFFlushCPOutputBuffer(info);
    }
    RTFFlushUnicodeOutputBuffer(info);
}
static void
RTFPutUnicodeChar(RTF_Info *info, int c)
{
if (info->dwCPOutputCount)
RTFFlushCPOutputBuffer(info);
if (info->dwOutputCount * sizeof(WCHAR) >= ( sizeof info->OutputBuffer - 1 ) )
RTFFlushUnicodeOutputBuffer( info );
info->OutputBuffer[info->dwOutputCount++] = c; info->OutputBuffer[info->dwOutputCount++] = c;
} }
static void RTFOutputUnicodeString( RTF_Info *info, WCHAR *str, int len ) static void
RTFPutCodePageChar(RTF_Info *info, int c)
{ {
assert(str[len] == '\0'); /* Use dynamic buffer here because it's the best way to handle
if (len) { * MBCS codepages without having to worry about partial chars */
ME_InsertTextFromCursor( info->editor, 0, str, len, info->style ); if (info->dwCPOutputCount >= info->dwMaxCPOutputCount)
} {
} info->dwMaxCPOutputCount *= 2;
info->cpOutputBuffer = RTFReAlloc(info->cpOutputBuffer, info->dwMaxCPOutputCount);
}
void RTFFlushOutputBuffer( RTF_Info *info ) info->cpOutputBuffer[info->dwCPOutputCount++] = c;
{
info->OutputBuffer[info->dwOutputCount] = 0;
RTFOutputUnicodeString(info, info->OutputBuffer, info->dwOutputCount);
info->dwOutputCount = 0;
} }

View File

@ -353,7 +353,7 @@
# define rtfRTLDoc 76 /* new in 1.10 */ # define rtfRTLDoc 76 /* new in 1.10 */
# define rtfLTRDoc 77 /* new in 1.10 */ # define rtfLTRDoc 77 /* new in 1.10 */
# define rtfAnsiCodePage 78 # define rtfAnsiCodePage 78
# define rtfUnicodeLength 79 # define rtfUTF8RTF 79
# define rtfSectAttr 9 # define rtfSectAttr 9
# define rtfSectDef 0 # define rtfSectDef 0
@ -595,6 +595,7 @@
# define rtfCharCharSet 33 /* new in 1.10 */ # define rtfCharCharSet 33 /* new in 1.10 */
# define rtfLanguage 34 # define rtfLanguage 34
# define rtfGray 35 # define rtfGray 35
# define rtfUnicodeLength 36
# define rtfPictAttr 13 # define rtfPictAttr 13
# define rtfMacQD 0 # define rtfMacQD 0
@ -932,20 +933,6 @@
# define rtfLangTurkish 0x041f # define rtfLangTurkish 0x041f
# define rtfLangUrdu 0x0420 # define rtfLangUrdu 0x0420
/*
* CharSet indices
*/
# define rtfCSGeneral 0 /* general (default) charset */
# define rtfCSSymbol 1 /* symbol charset */
/*
* Flags for auto-charset-processing. Both are on by default.
*/
# define rtfReadCharSet 0x01 /* auto-read charset files */
# define rtfSwitchCharSet 0x02 /* auto-switch charset maps */
/* /*
* Style types * Style types
*/ */
@ -1026,23 +1013,25 @@ struct RTFStyleElt
# define New(t) ((t *) RTFAlloc ((int) sizeof (t))) # define New(t) ((t *) RTFAlloc ((int) sizeof (t)))
/* maximum number of character values representable in a byte */ /* Parser stack size */
# define charSetSize 256 # define maxStack 32
/* charset stack size */
# define maxCSStack 10
/* character format stack size */
# define maxCharFormatStack 32
struct _RTF_Info; struct _RTF_Info;
typedef struct _RTF_Info RTF_Info; typedef struct _RTF_Info RTF_Info;
typedef void (*RTFFuncPtr) (RTF_Info *); /* generic function pointer */ typedef void (*RTFFuncPtr) (RTF_Info *); /* generic function pointer */
/*
 * RTF parser stack element: the state saved when a group begins
 * and restored when the group ends.
 */
typedef struct tagRTFState
{
    CHARFORMAT2W fmt;      /* character format of the current style */
    int codePage;          /* code page used for text conversion */
    int unicodeLength;     /* current \ucN value */
} RTFState;
struct _RTF_Info { struct _RTF_Info {
/* /*
* Public variables (listed in rtf.h) * Public variables (listed in rtf.h)
@ -1087,8 +1076,9 @@ struct _RTF_Info {
RTFColor *colorList; /* initialized to NULL */ RTFColor *colorList; /* initialized to NULL */
RTFStyle *styleList; RTFStyle *styleList;
int ansiCodePage; /* ANSI codepage used in conversion to Unicode */ int ansiCodePage; /* ANSI codepage used in conversion to Unicode */
int unicodeLength; /* The length of ANSI representation of Unicode characters */
/* Character attributes */
int unicodeLength; /* The length of ANSI representation of Unicode characters */
int codePage; /* Current codepage for text conversion */ int codePage; /* Current codepage for text conversion */
char *inputName; char *inputName;
@ -1118,9 +1108,12 @@ struct _RTF_Info {
DWORD dwOutputCount; DWORD dwOutputCount;
WCHAR OutputBuffer[0x1000]; WCHAR OutputBuffer[0x1000];
CHARFORMAT2W formatStack[maxCharFormatStack]; DWORD dwCPOutputCount;
int codePageStack[maxCharFormatStack]; DWORD dwMaxCPOutputCount;
int formatStackTop; char *cpOutputBuffer;
RTFState stack[maxStack];
int stackTop;
}; };
@ -1129,6 +1122,7 @@ struct _RTF_Info {
*/ */
void RTFInit (RTF_Info *); void RTFInit (RTF_Info *);
void RTFDestroy(RTF_Info *info);
void RTFSetInputName (RTF_Info *, char *); void RTFSetInputName (RTF_Info *, char *);
char *RTFGetInputName (RTF_Info *); char *RTFGetInputName (RTF_Info *);
void RTFSetOutputName (RTF_Info *, char *); void RTFSetOutputName (RTF_Info *, char *);