/*
 * Reviewer summary of the patch below (derived only from the hunks shown).
 *
 * dlls/xmllite/reader.c
 *  - Copyright line: "2016" becomes "2016-2017".
 *  - Adds a forward declaration of is_namestartchar() ahead of
 *    debugstr_nodetype().
 *  - readerinput_detectencoding(): removes the local startW/commentW tables
 *    and the two memcmp() checks that used them. The start of the encoded
 *    buffer is now read through a WCHAR pointer: when the first WCHAR is '<'
 *    and the second is '?', '!' or passes is_namestartchar(), *enc is set to
 *    XmlEncoding_UTF16. The readerinput_is_utf8() check still runs first, and
 *    the UTF-8 BOM memcmp() check still follows as the next else-branch.
 *  - reader_parse_nextnode(): the TRACE issued after
 *    readerinput_detectencoding() prints "(unknown)" when
 *    enc == XmlEncoding_Unknown rather than indexing xml_encoding_map[enc].
 *
 * dlls/xmllite/tests/reader.c
 *  - create_stream_on_data() now takes (const void *data, unsigned int size).
 *  - Adds test_encoding_detection() and calls it last in START_TEST(reader).
 *    It creates one reader, then for every entry of encoding_testsA (byte
 *    strings) and encoding_testsW (WCHAR arrays) builds a stream, calls
 *    SetInput and Read, and checks that Read returns S_OK with a node type
 *    other than XmlNodeType_None.
 *
 * NOTE(review): several string literals in the test hunks are empty or hold
 * only a BOM (xmldecl_short, most encoding_testsA entries). This looks like
 * markup that was stripped when the patch text was extracted -- confirm
 * against the original commit before relying on these hunks.
 * NOTE(review): the line breaks of the hunks are collapsed in this copy; the
 * patch presumably will not apply until the original newlines are restored.
 */
diff --git a/dlls/xmllite/reader.c b/dlls/xmllite/reader.c index a880bee2566..699af4b2a10 100644 --- a/dlls/xmllite/reader.c +++ b/dlls/xmllite/reader.c @@ -1,7 +1,7 @@ /* * IXmlReader implementation * - * Copyright 2010, 2012-2013, 2016 Nikolay Sivov + * Copyright 2010, 2012-2013, 2016-2017 Nikolay Sivov * * This library is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public @@ -93,6 +93,8 @@ static const WCHAR gtW[] = {'>',0}; static const WCHAR commentW[] = {'<','!','-','-',0}; static const WCHAR piW[] = {'<','?',0}; +static BOOL is_namestartchar(WCHAR ch); + static const char *debugstr_nodetype(XmlNodeType nodetype) { static const char * const type_names[] = @@ -840,10 +842,9 @@ static inline BOOL readerinput_is_utf8(xmlreaderinput *readerinput) static HRESULT readerinput_detectencoding(xmlreaderinput *readerinput, xml_encoding *enc) { encoded_buffer *buffer = &readerinput->buffer->encoded; - static const WCHAR startW[] = {'<','?'}; - static const WCHAR commentW[] = {'<','!'}; static const char utf8bom[] = {0xef,0xbb,0xbf}; static const char utf16lebom[] = {0xff,0xfe}; + WCHAR *ptrW; *enc = XmlEncoding_Unknown; @@ -854,13 +855,17 @@ static HRESULT readerinput_detectencoding(xmlreaderinput *readerinput, xml_encod if (buffer->written <= 3) return MX_E_INPUTEND; } + ptrW = (WCHAR *)buffer->data; /* try start symbols if we have enough data to do that, input buffer should contain first chunk already */ if (readerinput_is_utf8(readerinput)) *enc = XmlEncoding_UTF8; - else if (!memcmp(buffer->data, startW, sizeof(startW)) || - !memcmp(buffer->data, commentW, sizeof(commentW))) - *enc = XmlEncoding_UTF16; + else if (*ptrW == '<') + { + ptrW++; + if (*ptrW == '?' || *ptrW == '!' 
|| is_namestartchar(*ptrW)) + *enc = XmlEncoding_UTF16; + } /* try with BOM now */ else if (!memcmp(buffer->data, utf8bom, sizeof(utf8bom))) { @@ -2492,7 +2497,8 @@ static HRESULT reader_parse_nextnode(xmlreader *reader) /* try to detect encoding by BOM or data and set input code page */ hr = readerinput_detectencoding(reader->input, &enc); - TRACE("detected encoding %s, 0x%08x\n", debugstr_w(xml_encoding_map[enc].name), hr); + TRACE("detected encoding %s, 0x%08x\n", enc == XmlEncoding_Unknown ? "(unknown)" : + debugstr_w(xml_encoding_map[enc].name), hr); if (FAILED(hr)) return hr; /* always switch first time cause we have to put something in */ diff --git a/dlls/xmllite/tests/reader.c b/dlls/xmllite/tests/reader.c index f7f738e6820..fba73c9e09a 100644 --- a/dlls/xmllite/tests/reader.c +++ b/dlls/xmllite/tests/reader.c @@ -49,7 +49,7 @@ static void free_str(WCHAR *str) static const char xmldecl_full[] = "\xef\xbb\xbf\n"; static const char xmldecl_short[] = ""; -static IStream *create_stream_on_data(const char *data, int size) +static IStream *create_stream_on_data(const void *data, unsigned int size) { IStream *stream = NULL; HGLOBAL hglobal; @@ -2086,6 +2086,70 @@ static void test_read_charref(void) IStream_Release(stream); } +static void test_encoding_detection(void) +{ + static const struct encoding_testW + { + WCHAR text[16]; + } + encoding_testsW[] = + { + { { '<','?','p','i',' ','?','>',0 } }, + { { '<','!','-','-',' ','c','-','-','>',0 } }, + { { 0xfeff,'<','a','/','>',0 } }, + { { '<','a','/','>',0 } }, + }; + static const char *encoding_testsA[] = + { + "", + "", + "\xef\xbb\xbf", /* UTF-8 BOM */ + "", + }; + IXmlReader *reader; + XmlNodeType type; + IStream *stream; + unsigned int i; + HRESULT hr; + + hr = CreateXmlReader(&IID_IXmlReader, (void **)&reader, NULL); + ok(hr == S_OK, "S_OK, got %08x\n", hr); + + /* there's no way to query detected encoding back, so just verify that document is browsable */ + + for (i = 0; i < 
sizeof(encoding_testsA)/sizeof(encoding_testsA[0]); i++) + { + stream = create_stream_on_data(encoding_testsA[i], strlen(encoding_testsA[i])); + + hr = IXmlReader_SetInput(reader, (IUnknown *)stream); + ok(hr == S_OK, "got %08x\n", hr); + + type = XmlNodeType_None; + hr = IXmlReader_Read(reader, &type); + ok(hr == S_OK, "got %08x\n", hr); + ok(type != XmlNodeType_None, "Unexpected node type %d\n", type); + + IStream_Release(stream); + } + + for (i = 0; i < sizeof(encoding_testsW)/sizeof(encoding_testsW[0]); i++) + { + stream = create_stream_on_data(encoding_testsW[i].text, lstrlenW(encoding_testsW[i].text) * sizeof(WCHAR)); + + hr = IXmlReader_SetInput(reader, (IUnknown *)stream); + ok(hr == S_OK, "got %08x\n", hr); + + type = XmlNodeType_None; + hr = IXmlReader_Read(reader, &type); + ok(hr == S_OK, "%u: got %08x\n", i, hr); + ok(type != XmlNodeType_None, "%u: unexpected node type %d\n", i, type); + + IStream_Release(stream); + } + + IXmlReader_Release(reader); +} + START_TEST(reader) { test_reader_create(); @@ -2108,4 +2172,5 @@ START_TEST(reader) test_prefix(); test_namespaceuri(); test_read_charref(); + test_encoding_detection(); }