wmc: Reimplement input format detection to correctly handle Unicode BOM.
Signed-off-by: Alexandre Julliard <julliard@winehq.org>
This commit is contained in:
parent
c2bd9deac8
commit
b7b44224d1
206
tools/wmc/mcl.c
206
tools/wmc/mcl.c
|
@ -160,14 +160,13 @@ void set_codepage(int cp)
|
||||||
/*
|
/*
|
||||||
* Input functions
|
* Input functions
|
||||||
*/
|
*/
|
||||||
|
#define INPUTBUFFER_SIZE 2048 /* Must be larger than 4 and approx. large enough to hold a line */
|
||||||
|
|
||||||
static int nungetstack = 0;
|
static int nungetstack = 0;
|
||||||
static int allocungetstack = 0;
|
static int allocungetstack = 0;
|
||||||
static char *ungetstack = NULL;
|
static char *ungetstack = NULL;
|
||||||
static int ninputbuffer = 0;
|
static int ninputbuffer = 0;
|
||||||
static WCHAR *inputbuffer = NULL;
|
static WCHAR inputbuffer[INPUTBUFFER_SIZE];
|
||||||
static char *xlatebuffer = NULL;
|
|
||||||
|
|
||||||
#define INPUTBUFFER_SIZE 2048 /* Must be larger than 4 and approx. large enough to hold a line */
|
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Fill the input buffer with *one* line of input.
|
* Fill the input buffer with *one* line of input.
|
||||||
|
@ -179,141 +178,74 @@ static char *xlatebuffer = NULL;
|
||||||
*/
|
*/
|
||||||
static int fill_inputbuffer(void)
|
static int fill_inputbuffer(void)
|
||||||
{
|
{
|
||||||
int n;
|
static enum input_mode { INPUT_UNKNOWN, INPUT_ASCII, INPUT_UTF8, INPUT_UNICODE } mode;
|
||||||
static const char err_fatalread[] = "Fatal: reading input failed";
|
static int swapped;
|
||||||
static int endian = -1;
|
static unsigned char utf8_bom[3] = { 0xef, 0xbb, 0xbf };
|
||||||
|
WCHAR *wbuf;
|
||||||
|
int i, pos = 0, len = 0;
|
||||||
|
char buffer[INPUTBUFFER_SIZE];
|
||||||
|
|
||||||
if(!inputbuffer)
|
if (mode == INPUT_UNKNOWN)
|
||||||
{
|
{
|
||||||
inputbuffer = xmalloc(INPUTBUFFER_SIZE*sizeof(WCHAR));
|
len = fread( buffer, 1, 8, yyin );
|
||||||
xlatebuffer = xmalloc(INPUTBUFFER_SIZE);
|
wbuf = (WCHAR *)buffer;
|
||||||
}
|
if (len >= 3 && !memcmp( buffer, utf8_bom, 3 ))
|
||||||
|
{
|
||||||
|
mode = INPUT_UTF8;
|
||||||
|
memmove( buffer, buffer + 3, len - 3 );
|
||||||
|
len -= 3;
|
||||||
|
}
|
||||||
|
else if (len == 8)
|
||||||
|
{
|
||||||
|
if (wbuf[0] == 0xfeff || wbuf[0] == 0xfffe)
|
||||||
|
{
|
||||||
|
mode = INPUT_UNICODE;
|
||||||
|
pos = 1;
|
||||||
|
swapped = (wbuf[0] == 0xfffe);
|
||||||
|
}
|
||||||
|
else if (!((wbuf[0] | wbuf[1] | wbuf[2] | wbuf[3]) & 0xff00))
|
||||||
|
{
|
||||||
|
mode = INPUT_UNICODE;
|
||||||
|
}
|
||||||
|
else if (!((wbuf[0] | wbuf[1] | wbuf[2] | wbuf[3]) & 0x00ff))
|
||||||
|
{
|
||||||
|
mode = INPUT_UNICODE;
|
||||||
|
swapped = 1;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
try_again:
|
if (mode == INPUT_UNICODE)
|
||||||
if(!unicodein)
|
{
|
||||||
{
|
len = 4 - pos;
|
||||||
char *cptr;
|
memcpy( inputbuffer, wbuf + pos, len * sizeof(WCHAR) );
|
||||||
cptr = fgets(xlatebuffer, INPUTBUFFER_SIZE, yyin);
|
}
|
||||||
if(!cptr && ferror(yyin))
|
else if (mode == INPUT_UNKNOWN) mode = unicodein ? INPUT_UTF8 : INPUT_ASCII;
|
||||||
xyyerror(err_fatalread);
|
}
|
||||||
else if(!cptr)
|
|
||||||
return 0;
|
|
||||||
if (codepage == CP_UTF8)
|
|
||||||
{
|
|
||||||
WCHAR *buf = utf8_to_unicode( xlatebuffer, strlen(xlatebuffer), &n );
|
|
||||||
memcpy( inputbuffer, buf, (n + 1) * sizeof(WCHAR) );
|
|
||||||
free( buf );
|
|
||||||
}
|
|
||||||
else
|
|
||||||
{
|
|
||||||
n = wmc_mbstowcs(codepage, 0, xlatebuffer, strlen(xlatebuffer)+1, inputbuffer, INPUTBUFFER_SIZE);
|
|
||||||
if(n < 0)
|
|
||||||
internal_error(__FILE__, __LINE__, "Could not translate to unicode (%d)\n", n);
|
|
||||||
}
|
|
||||||
if(n <= 1)
|
|
||||||
goto try_again; /* Should not happen */
|
|
||||||
n--; /* Strip added conversion '\0' from input length */
|
|
||||||
/*
|
|
||||||
* FIXME:
|
|
||||||
* Detect UTF-8 in the first time we read some bytes by
|
|
||||||
* checking the special sequence "FE..." or something like
|
|
||||||
* that. I need to check www.unicode.org for details.
|
|
||||||
*/
|
|
||||||
}
|
|
||||||
else
|
|
||||||
{
|
|
||||||
if(endian == -1)
|
|
||||||
{
|
|
||||||
n = fread(inputbuffer, 1, 8, yyin);
|
|
||||||
if(n != 8)
|
|
||||||
{
|
|
||||||
if(!n && ferror(yyin))
|
|
||||||
xyyerror(err_fatalread);
|
|
||||||
else
|
|
||||||
xyyerror("Fatal: file too short to determine byteorder (should never happen)\n");
|
|
||||||
}
|
|
||||||
if(isisochar(inputbuffer[0]) &&
|
|
||||||
isisochar(inputbuffer[1]) &&
|
|
||||||
isisochar(inputbuffer[2]) &&
|
|
||||||
isisochar(inputbuffer[3]))
|
|
||||||
{
|
|
||||||
#ifdef WORDS_BIGENDIAN
|
|
||||||
endian = WMC_BO_BIG;
|
|
||||||
#else
|
|
||||||
endian = WMC_BO_LITTLE;
|
|
||||||
#endif
|
|
||||||
}
|
|
||||||
else if(isisochar(BYTESWAP_WORD(inputbuffer[0])) &&
|
|
||||||
isisochar(BYTESWAP_WORD(inputbuffer[1])) &&
|
|
||||||
isisochar(BYTESWAP_WORD(inputbuffer[2])) &&
|
|
||||||
isisochar(BYTESWAP_WORD(inputbuffer[3])))
|
|
||||||
{
|
|
||||||
#ifdef WORDS_BIGENDIAN
|
|
||||||
endian = WMC_BO_LITTLE;
|
|
||||||
#else
|
|
||||||
endian = WMC_BO_BIG;
|
|
||||||
#endif
|
|
||||||
}
|
|
||||||
else
|
|
||||||
xyyerror("Fatal: cannot determine file's byteorder\n");
|
|
||||||
/* FIXME:
|
|
||||||
* Determine the file-endian with the leader-bytes
|
|
||||||
* "FF FE..."; can't remember the exact sequence.
|
|
||||||
*/
|
|
||||||
n /= 2;
|
|
||||||
#ifdef WORDS_BIGENDIAN
|
|
||||||
if(endian == WMC_BO_LITTLE)
|
|
||||||
#else
|
|
||||||
if(endian == WMC_BO_BIG)
|
|
||||||
#endif
|
|
||||||
{
|
|
||||||
inputbuffer[0] = BYTESWAP_WORD(inputbuffer[0]);
|
|
||||||
inputbuffer[1] = BYTESWAP_WORD(inputbuffer[1]);
|
|
||||||
inputbuffer[2] = BYTESWAP_WORD(inputbuffer[2]);
|
|
||||||
inputbuffer[3] = BYTESWAP_WORD(inputbuffer[3]);
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
switch (mode)
|
||||||
else
|
{
|
||||||
{
|
case INPUT_ASCII:
|
||||||
int i;
|
if (!fgets( buffer + len, sizeof(buffer) - len, yyin )) break;
|
||||||
n = 0;
|
ninputbuffer = wmc_mbstowcs( codepage, 0, buffer, strlen(buffer), inputbuffer, INPUTBUFFER_SIZE );
|
||||||
for(i = 0; i < INPUTBUFFER_SIZE; i++)
|
if (ninputbuffer < 0) internal_error(__FILE__, __LINE__, "Could not translate to unicode\n");
|
||||||
{
|
return 1;
|
||||||
int t;
|
case INPUT_UTF8:
|
||||||
t = fread(&inputbuffer[i], 2, 1, yyin);
|
if (!fgets( buffer + len, sizeof(buffer) - len, yyin )) break;
|
||||||
if(!t && ferror(yyin))
|
wbuf = utf8_to_unicode( buffer, strlen(buffer), &ninputbuffer );
|
||||||
xyyerror(err_fatalread);
|
memcpy( inputbuffer, wbuf, ninputbuffer * sizeof(WCHAR) );
|
||||||
else if(!t && n)
|
free( wbuf );
|
||||||
break;
|
return 1;
|
||||||
n++;
|
case INPUT_UNICODE:
|
||||||
#ifdef WORDS_BIGENDIAN
|
len += fread( inputbuffer + len, sizeof(WCHAR), INPUTBUFFER_SIZE - len, yyin );
|
||||||
if(endian == WMC_BO_LITTLE)
|
if (!len) break;
|
||||||
#else
|
if (swapped) for (i = 0; i < len; i++) inputbuffer[i] = BYTESWAP_WORD( inputbuffer[i] );
|
||||||
if(endian == WMC_BO_BIG)
|
ninputbuffer = len;
|
||||||
#endif
|
return 1;
|
||||||
{
|
case INPUT_UNKNOWN:
|
||||||
if((inputbuffer[i] = BYTESWAP_WORD(inputbuffer[i])) == '\n')
|
break;
|
||||||
break;
|
}
|
||||||
}
|
if (ferror(yyin)) xyyerror( "Fatal: reading input failed\n" );
|
||||||
else
|
return 0;
|
||||||
{
|
|
||||||
if(inputbuffer[i] == '\n')
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
|
||||||
|
|
||||||
if(!n)
|
|
||||||
{
|
|
||||||
mcy_warning("Re-read line (input was or converted to zilch)\n");
|
|
||||||
goto try_again; /* Should not happen, but could be due to stdin reading and a signal */
|
|
||||||
}
|
|
||||||
|
|
||||||
ninputbuffer += n;
|
|
||||||
return 1;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
static int get_unichar(void)
|
static int get_unichar(void)
|
||||||
|
@ -332,7 +264,7 @@ static int get_unichar(void)
|
||||||
}
|
}
|
||||||
|
|
||||||
ninputbuffer--;
|
ninputbuffer--;
|
||||||
return (int)(*b++ & 0xffff);
|
return *b++;
|
||||||
}
|
}
|
||||||
|
|
||||||
static void unget_unichar(int ch)
|
static void unget_unichar(int ch)
|
||||||
|
|
Loading…
Reference in New Issue