xmllite/reader: Improve input stream encoding detection.
Signed-off-by: Nikolay Sivov <nsivov@codeweavers.com>
Signed-off-by: Alexandre Julliard <julliard@winehq.org>
This commit is contained in:
parent
0aaade2cec
commit
80cf8838e3
|
@@ -1,7 +1,7 @@
|
||||||
/*
|
/*
|
||||||
* IXmlReader implementation
|
* IXmlReader implementation
|
||||||
*
|
*
|
||||||
* Copyright 2010, 2012-2013, 2016 Nikolay Sivov
|
* Copyright 2010, 2012-2013, 2016-2017 Nikolay Sivov
|
||||||
*
|
*
|
||||||
* This library is free software; you can redistribute it and/or
|
* This library is free software; you can redistribute it and/or
|
||||||
* modify it under the terms of the GNU Lesser General Public
|
* modify it under the terms of the GNU Lesser General Public
|
||||||
|
@@ -93,6 +93,8 @@ static const WCHAR gtW[] = {'>',0};
|
||||||
static const WCHAR commentW[] = {'<','!','-','-',0};
|
static const WCHAR commentW[] = {'<','!','-','-',0};
|
||||||
static const WCHAR piW[] = {'<','?',0};
|
static const WCHAR piW[] = {'<','?',0};
|
||||||
|
|
||||||
|
static BOOL is_namestartchar(WCHAR ch);
|
||||||
|
|
||||||
static const char *debugstr_nodetype(XmlNodeType nodetype)
|
static const char *debugstr_nodetype(XmlNodeType nodetype)
|
||||||
{
|
{
|
||||||
static const char * const type_names[] =
|
static const char * const type_names[] =
|
||||||
|
@@ -840,10 +842,9 @@ static inline BOOL readerinput_is_utf8(xmlreaderinput *readerinput)
|
||||||
static HRESULT readerinput_detectencoding(xmlreaderinput *readerinput, xml_encoding *enc)
|
static HRESULT readerinput_detectencoding(xmlreaderinput *readerinput, xml_encoding *enc)
|
||||||
{
|
{
|
||||||
encoded_buffer *buffer = &readerinput->buffer->encoded;
|
encoded_buffer *buffer = &readerinput->buffer->encoded;
|
||||||
static const WCHAR startW[] = {'<','?'};
|
|
||||||
static const WCHAR commentW[] = {'<','!'};
|
|
||||||
static const char utf8bom[] = {0xef,0xbb,0xbf};
|
static const char utf8bom[] = {0xef,0xbb,0xbf};
|
||||||
static const char utf16lebom[] = {0xff,0xfe};
|
static const char utf16lebom[] = {0xff,0xfe};
|
||||||
|
WCHAR *ptrW;
|
||||||
|
|
||||||
*enc = XmlEncoding_Unknown;
|
*enc = XmlEncoding_Unknown;
|
||||||
|
|
||||||
|
@@ -854,13 +855,17 @@ static HRESULT readerinput_detectencoding(xmlreaderinput *readerinput, xml_encod
|
||||||
if (buffer->written <= 3) return MX_E_INPUTEND;
|
if (buffer->written <= 3) return MX_E_INPUTEND;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
ptrW = (WCHAR *)buffer->data;
|
||||||
/* try start symbols if we have enough data to do that, input buffer should contain
|
/* try start symbols if we have enough data to do that, input buffer should contain
|
||||||
first chunk already */
|
first chunk already */
|
||||||
if (readerinput_is_utf8(readerinput))
|
if (readerinput_is_utf8(readerinput))
|
||||||
*enc = XmlEncoding_UTF8;
|
*enc = XmlEncoding_UTF8;
|
||||||
else if (!memcmp(buffer->data, startW, sizeof(startW)) ||
|
else if (*ptrW == '<')
|
||||||
!memcmp(buffer->data, commentW, sizeof(commentW)))
|
{
|
||||||
*enc = XmlEncoding_UTF16;
|
ptrW++;
|
||||||
|
if (*ptrW == '?' || *ptrW == '!' || is_namestartchar(*ptrW))
|
||||||
|
*enc = XmlEncoding_UTF16;
|
||||||
|
}
|
||||||
/* try with BOM now */
|
/* try with BOM now */
|
||||||
else if (!memcmp(buffer->data, utf8bom, sizeof(utf8bom)))
|
else if (!memcmp(buffer->data, utf8bom, sizeof(utf8bom)))
|
||||||
{
|
{
|
||||||
|
@@ -2492,7 +2497,8 @@ static HRESULT reader_parse_nextnode(xmlreader *reader)
|
||||||
|
|
||||||
/* try to detect encoding by BOM or data and set input code page */
|
/* try to detect encoding by BOM or data and set input code page */
|
||||||
hr = readerinput_detectencoding(reader->input, &enc);
|
hr = readerinput_detectencoding(reader->input, &enc);
|
||||||
TRACE("detected encoding %s, 0x%08x\n", debugstr_w(xml_encoding_map[enc].name), hr);
|
TRACE("detected encoding %s, 0x%08x\n", enc == XmlEncoding_Unknown ? "(unknown)" :
|
||||||
|
debugstr_w(xml_encoding_map[enc].name), hr);
|
||||||
if (FAILED(hr)) return hr;
|
if (FAILED(hr)) return hr;
|
||||||
|
|
||||||
/* always switch first time cause we have to put something in */
|
/* always switch first time cause we have to put something in */
|
||||||
|
|
|
@@ -49,7 +49,7 @@ static void free_str(WCHAR *str)
|
||||||
static const char xmldecl_full[] = "\xef\xbb\xbf<?xml version=\"1.0\" encoding=\"UTF-8\" standalone=\"yes\"?>\n";
|
static const char xmldecl_full[] = "\xef\xbb\xbf<?xml version=\"1.0\" encoding=\"UTF-8\" standalone=\"yes\"?>\n";
|
||||||
static const char xmldecl_short[] = "<?xml version=\"1.0\"?><RegistrationInfo/>";
|
static const char xmldecl_short[] = "<?xml version=\"1.0\"?><RegistrationInfo/>";
|
||||||
|
|
||||||
static IStream *create_stream_on_data(const char *data, int size)
|
static IStream *create_stream_on_data(const void *data, unsigned int size)
|
||||||
{
|
{
|
||||||
IStream *stream = NULL;
|
IStream *stream = NULL;
|
||||||
HGLOBAL hglobal;
|
HGLOBAL hglobal;
|
||||||
|
@@ -2086,6 +2086,70 @@ static void test_read_charref(void)
|
||||||
IStream_Release(stream);
|
IStream_Release(stream);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static void test_encoding_detection(void)
|
||||||
|
{
|
||||||
|
static const struct encoding_testW
|
||||||
|
{
|
||||||
|
WCHAR text[16];
|
||||||
|
}
|
||||||
|
encoding_testsW[] =
|
||||||
|
{
|
||||||
|
{ { '<','?','p','i',' ','?','>',0 } },
|
||||||
|
{ { '<','!','-','-',' ','c','-','-','>',0 } },
|
||||||
|
{ { 0xfeff,'<','a','/','>',0 } },
|
||||||
|
{ { '<','a','/','>',0 } },
|
||||||
|
};
|
||||||
|
static const char *encoding_testsA[] =
|
||||||
|
{
|
||||||
|
"<?pi ?>",
|
||||||
|
"<!-- comment -->",
|
||||||
|
"\xef\xbb\xbf<a/>", /* UTF-8 BOM */
|
||||||
|
"<a/>",
|
||||||
|
};
|
||||||
|
IXmlReader *reader;
|
||||||
|
XmlNodeType type;
|
||||||
|
IStream *stream;
|
||||||
|
unsigned int i;
|
||||||
|
HRESULT hr;
|
||||||
|
|
||||||
|
hr = CreateXmlReader(&IID_IXmlReader, (void **)&reader, NULL);
|
||||||
|
ok(hr == S_OK, "S_OK, got %08x\n", hr);
|
||||||
|
|
||||||
|
/* there's no way to query detected encoding back, so just verify that document is browsable */
|
||||||
|
|
||||||
|
for (i = 0; i < sizeof(encoding_testsA)/sizeof(encoding_testsA[0]); i++)
|
||||||
|
{
|
||||||
|
stream = create_stream_on_data(encoding_testsA[i], strlen(encoding_testsA[i]));
|
||||||
|
|
||||||
|
hr = IXmlReader_SetInput(reader, (IUnknown *)stream);
|
||||||
|
ok(hr == S_OK, "got %08x\n", hr);
|
||||||
|
|
||||||
|
type = XmlNodeType_None;
|
||||||
|
hr = IXmlReader_Read(reader, &type);
|
||||||
|
ok(hr == S_OK, "got %08x\n", hr);
|
||||||
|
ok(type != XmlNodeType_None, "Unexpected node type %d\n", type);
|
||||||
|
|
||||||
|
IStream_Release(stream);
|
||||||
|
}
|
||||||
|
|
||||||
|
for (i = 0; i < sizeof(encoding_testsW)/sizeof(encoding_testsW[0]); i++)
|
||||||
|
{
|
||||||
|
stream = create_stream_on_data(encoding_testsW[i].text, lstrlenW(encoding_testsW[i].text) * sizeof(WCHAR));
|
||||||
|
|
||||||
|
hr = IXmlReader_SetInput(reader, (IUnknown *)stream);
|
||||||
|
ok(hr == S_OK, "got %08x\n", hr);
|
||||||
|
|
||||||
|
type = XmlNodeType_None;
|
||||||
|
hr = IXmlReader_Read(reader, &type);
|
||||||
|
ok(hr == S_OK, "%u: got %08x\n", i, hr);
|
||||||
|
ok(type != XmlNodeType_None, "%u: unexpected node type %d\n", i, type);
|
||||||
|
|
||||||
|
IStream_Release(stream);
|
||||||
|
}
|
||||||
|
|
||||||
|
IXmlReader_Release(reader);
|
||||||
|
}
|
||||||
|
|
||||||
START_TEST(reader)
|
START_TEST(reader)
|
||||||
{
|
{
|
||||||
test_reader_create();
|
test_reader_create();
|
||||||
|
@@ -2108,4 +2172,5 @@ START_TEST(reader)
|
||||||
test_prefix();
|
test_prefix();
|
||||||
test_namespaceuri();
|
test_namespaceuri();
|
||||||
test_read_charref();
|
test_read_charref();
|
||||||
|
test_encoding_detection();
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in New Issue