unicode: Add support for high Unicode planes in decomposition tables.

Signed-off-by: Alexandre Julliard <julliard@winehq.org>
This commit is contained in:
Alexandre Julliard 2020-02-17 10:44:04 +01:00
parent 148f564d10
commit c658731975
5 changed files with 4478 additions and 1695 deletions

View File

@ -6257,17 +6257,19 @@ static void test_NormalizeString(void)
ok( dstlen == (i < 2 ? 15 : 64), "%d: wrong len %d\n", i, dstlen );
SetLastError( 0xdeadbeef );
dstlen = pNormalizeString( norm_forms[i], L"AB\xd800Z", -1, dst, ARRAY_SIZE(dst) );
todo_wine ok( dstlen == -3, "%d: wrong len %d\n", i, dstlen );
todo_wine ok( GetLastError() == ERROR_NO_UNICODE_TRANSLATION, "%d: wrong error %d\n", i, GetLastError() );
ok( dstlen == -3, "%d: wrong len %d\n", i, dstlen );
ok( GetLastError() == ERROR_NO_UNICODE_TRANSLATION, "%d: wrong error %d\n", i, GetLastError() );
dstlen = pNormalizeString( norm_forms[i], L"ABCD\xdc12Z", -1, NULL, 0 );
ok( dstlen == (i < 2 ? 21 : 64), "%d: wrong len %d\n", i, dstlen );
SetLastError( 0xdeadbeef );
dstlen = pNormalizeString( norm_forms[i], L"ABCD\xdc12Z", -1, dst, ARRAY_SIZE(dst) );
todo_wine ok( dstlen == -4, "%d: wrong len %d\n", i, dstlen );
todo_wine ok( GetLastError() == ERROR_NO_UNICODE_TRANSLATION, "%d: wrong error %d\n", i, GetLastError() );
ok( dstlen == -4, "%d: wrong len %d\n", i, dstlen );
ok( GetLastError() == ERROR_NO_UNICODE_TRANSLATION, "%d: wrong error %d\n", i, GetLastError() );
SetLastError( 0xdeadbeef );
dstlen = pNormalizeString( norm_forms[i], L"ABCD\xdc12Z", -1, dst, 2 );
todo_wine ok( dstlen == (i < 2 ? -18 : -74), "%d: wrong len %d\n", i, dstlen );
todo_wine
ok( dstlen == (i < 2 ? -18 : -74), "%d: wrong len %d\n", i, dstlen );
todo_wine_if (i == 0 || i == 2)
ok( GetLastError() == ERROR_INSUFFICIENT_BUFFER, "%d: wrong error %d\n", i, GetLastError() );
if (pRtlNormalizeString)
{
@ -6277,17 +6279,18 @@ static void test_NormalizeString(void)
ok( dstlen == (i < 2 ? 15 : 64), "%d: wrong len %d\n", i, dstlen );
dstlen = ARRAY_SIZE(dst);
status = pRtlNormalizeString( norm_forms[i], L"AB\xd800Z", -1, dst, &dstlen );
todo_wine ok( status == STATUS_NO_UNICODE_TRANSLATION, "%d: failed %x\n", i, status );
todo_wine ok( dstlen == 3, "%d: wrong len %d\n", i, dstlen );
ok( status == STATUS_NO_UNICODE_TRANSLATION, "%d: failed %x\n", i, status );
ok( dstlen == 3, "%d: wrong len %d\n", i, dstlen );
dstlen = 1;
status = pRtlNormalizeString( norm_forms[i], L"AB\xd800Z", -1, dst, &dstlen );
todo_wine_if( i == 0 || i == 2)
ok( status == STATUS_BUFFER_TOO_SMALL, "%d: failed %x\n", i, status );
todo_wine_if (i != 3)
todo_wine_if( i != 3)
ok( dstlen == (i < 2 ? 14 : 73), "%d: wrong len %d\n", i, dstlen );
dstlen = 2;
status = pRtlNormalizeString( norm_forms[i], L"AB\xd800Z", -1, dst, &dstlen );
todo_wine ok( status == STATUS_NO_UNICODE_TRANSLATION, "%d: failed %x\n", i, status );
todo_wine ok( dstlen == 3, "%d: wrong len %d\n", i, dstlen );
ok( status == STATUS_NO_UNICODE_TRANSLATION, "%d: failed %x\n", i, status );
ok( dstlen == 3, "%d: wrong len %d\n", i, dstlen );
}
}

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@ -155,14 +155,14 @@ static WCHAR casemap_ascii( WCHAR ch )
}
/* Look up the decomposition of ch through a multi-level index stored in table:
 * ch >> 8 picks a first-level entry, bits 4-7 of ch are added to it to pick a
 * second-level entry, and the low 4 bits are added to that to get the final slot.
 * That slot and the one after it hold the start and end positions of the mapping.
 * Returns a pointer into table with *len set to the mapping length, or NULL when
 * the two positions are equal; in that case *len is 1, or 2 when ch >= 0x10000. */
static const WCHAR *get_decomposition( const unsigned short *table, WCHAR ch, unsigned int *len )
static const WCHAR *get_decomposition( const unsigned short *table, unsigned int ch, unsigned int *len )
{
unsigned short offset = table[table[ch >> 8] + ((ch >> 4) & 0xf)] + (ch & 0xf);
unsigned short start = table[offset];
unsigned short end = table[offset + 1];
if ((*len = end - start)) return table + start;  /* non-empty range: a mapping exists */
*len = 1;
*len = 1 + (ch >= 0x10000);
return NULL;
}
@ -174,13 +174,13 @@ static BYTE get_combining_class( unsigned int c )
}
/* Returns TRUE when get_combining_class() reports a combining class of zero for c. */
static BOOL is_starter( WCHAR c )
static BOOL is_starter( unsigned int c )
{
return !get_combining_class( c );
}
static BOOL reorderable_pair( WCHAR c1, WCHAR c2 )
static BOOL reorderable_pair( unsigned int c1, unsigned int c2 )
{
BYTE ccc1, ccc2;
@ -191,23 +191,52 @@ static BOOL reorderable_pair( WCHAR c1, WCHAR c2 )
return ccc2 && (ccc1 > ccc2);
}
/* Decode one code point from the UTF-16 data at src, of which srclen WCHARs are available.
 * On success the code point is stored in *ch and the number of WCHARs consumed
 * (1 or 2) is returned. Returns 0, leaving *ch untouched, for a lone low surrogate
 * or for a high surrogate that is not followed by a low surrogate. */
static int get_utf16( const WCHAR *src, unsigned int srclen, unsigned int *ch )
{
    if (!IS_HIGH_SURROGATE( src[0] ))
    {
        /* single unit: only valid when it is not a stray low surrogate */
        if (IS_LOW_SURROGATE( src[0] )) return 0;
        *ch = src[0];
        return 1;
    }

    /* high surrogate: a low surrogate must follow within the buffer */
    if (srclen < 2 || !IS_LOW_SURROGATE( src[1] )) return 0;

    *ch = 0x10000 + ((src[0] & 0x3ff) << 10) + (src[1] & 0x3ff);
    return 2;
}
/* Store the code point ch at dst as UTF-16.
 * Values below 0x10000 are written as a single WCHAR; anything else is written
 * as a high/low surrogate pair, so dst must then have room for two WCHARs. */
static void put_utf16( WCHAR *dst, unsigned int ch )
{
    if (ch < 0x10000)
    {
        dst[0] = ch;
        return;
    }

    /* split the 20-bit offset into two 10-bit halves */
    ch -= 0x10000;
    dst[0] = 0xd800 | (ch >> 10);
    dst[1] = 0xdc00 | (ch & 0x3ff);
}
static void canonical_order_substring( WCHAR *str, unsigned int len )
{
unsigned int i;
unsigned int i, ch1, ch2, len1, len2;
BOOL swapped;
do
{
swapped = FALSE;
for (i = 0; i < len - 1; i++)
for (i = 0; i < len - 1; i += len1)
{
if (reorderable_pair( str[i], str[i + 1] ))
if (!(len1 = get_utf16( str + i, len - i, &ch1 ))) break;
if (i + len1 >= len) break;
if (!(len2 = get_utf16( str + i + len1, len - i - len1, &ch2 ))) break;
if (reorderable_pair( ch1, ch2 ))
{
WCHAR tmp = str[i];
str[i] = str[i + 1];
str[i + 1] = tmp;
WCHAR tmp[2];
memcpy( tmp, str + i, len1 * sizeof(WCHAR) );
memcpy( str + i, str + i + len1, len2 * sizeof(WCHAR) );
memcpy( str + i + len2, tmp, len1 * sizeof(WCHAR) );
swapped = TRUE;
i += len2 - len1;
}
}
} while (swapped);
@ -224,38 +253,43 @@ static void canonical_order_substring( WCHAR *str, unsigned int len )
*/
static void canonical_order_string( WCHAR *str, unsigned int len )
{
unsigned int i, next = 0;
unsigned int ch, i, r, next = 0;  /* r: number of WCHARs get_utf16() consumed for the current code point */
for (i = 1; i <= len; i++)
for (i = 0; i < len; i += r)
{
if (i == len || is_starter( str[i] ))
if (!(r = get_utf16( str + i, len - i, &ch ))) return;  /* stop at an invalid surrogate sequence */
if (i && is_starter( ch ))
{
if (i > next + 1) /* at least two successive non-starters */
canonical_order_substring( str + next, i - next );
next = i + 1;
next = i + r;  /* the next run begins right after this starter */
}
}
if (i > next + 1) canonical_order_substring( str + next, i - next );  /* order whatever follows the last boundary */
}
static NTSTATUS decompose_string( int compat, const WCHAR *src, int src_len, WCHAR *dst, int *dst_len )
{
const unsigned short *table = compat ? nfkd_table : nfd_table;
int src_pos, dst_pos = 0;
unsigned int decomp_len;
int src_pos, dst_pos;
unsigned int ch, len, decomp_len;
const WCHAR *decomp;
for (src_pos = 0; src_pos < src_len; src_pos++)
for (src_pos = dst_pos = 0; src_pos < src_len; src_pos += len, dst_pos += decomp_len)
{
if (dst_pos == *dst_len) break;
if ((decomp = get_decomposition( table, src[src_pos], &decomp_len )))
if (!(len = get_utf16( src + src_pos, src_len - src_pos, &ch )) ||
(ch >= 0xfdd0 && ch <= 0xfdef) || ((ch & 0xffff) >= 0xfffe))
{
if (dst_pos + decomp_len > *dst_len) break;
memcpy( dst + dst_pos, decomp, decomp_len * sizeof(WCHAR) );
*dst_len = src_pos + IS_HIGH_SURROGATE( src[src_pos] );
return STATUS_NO_UNICODE_TRANSLATION;
}
else dst[dst_pos] = src[src_pos];
dst_pos += decomp_len;
decomp = get_decomposition( table, ch, &decomp_len );
if (dst_pos + decomp_len > *dst_len) break;
if (decomp) memcpy( dst + dst_pos, decomp, decomp_len * sizeof(WCHAR) );
else put_utf16( dst + dst_pos, ch );
}
if (src_pos < src_len)
{
*dst_len += (src_len - src_pos) * (compat ? 18 : 3);
@ -1554,21 +1588,6 @@ NTSTATUS WINAPI RtlUTF8ToUnicodeN( WCHAR *dst, DWORD dstlen, DWORD *reslen, cons
}
/* Return the code point starting at src, combining a surrogate pair when one is present.
 * Returns 0 for a stray low surrogate, or for a high surrogate that is not
 * followed by a low surrogate within srclen WCHARs. */
static inline unsigned int get_surrogate_value( const WCHAR *src, unsigned int srclen )
{
    /* anything outside the surrogate range stands for itself */
    if (src[0] < 0xd800 || src[0] > 0xdfff) return src[0];

    if (src[0] > 0xdbff) return 0;  /* low surrogate in leading position */
    if (srclen <= 1) return 0;  /* nothing follows the high surrogate */
    if (src[1] < 0xdc00 || src[1] > 0xdfff) return 0;  /* follower is not a low surrogate */

    return 0x10000 + ((src[0] & 0x3ff) << 10) + (src[1] & 0x3ff);
}
/**************************************************************************
* RtlUnicodeToUTF8N (NTDLL.@)
*/
@ -1592,7 +1611,7 @@ NTSTATUS WINAPI RtlUnicodeToUTF8N( char *dst, DWORD dstlen, DWORD *reslen, const
else if (*src < 0x800) len += 2; /* 0x80-0x7ff: 2 bytes */
else
{
if (!(val = get_surrogate_value( src, srclen )))
if (!get_utf16( src, srclen, &val ))
{
val = 0xfffd;
status = STATUS_SOME_NOT_MAPPED;
@ -1629,7 +1648,7 @@ NTSTATUS WINAPI RtlUnicodeToUTF8N( char *dst, DWORD dstlen, DWORD *reslen, const
dst += 2;
continue;
}
if (!(val = get_surrogate_value( src, srclen )))
if (!get_utf16( src, srclen, &val ))
{
val = 0xfffd;
status = STATUS_SOME_NOT_MAPPED;

View File

@ -480,7 +480,7 @@ sub build_decompositions(@)
my @src = @_;
my @dst;
for (my $i = 0; $i < 65536; $i++)
for (my $i = 0; $i < @src; $i++)
{
next unless defined $src[$i];
my @decomp = get_decomposition( $i, \@src );
@ -2092,10 +2092,13 @@ sub dump_decompositions($@)
# first determine all the 16-char subsets that contain something
my @filled = (0) x 4096;
my $level1 = ($MAX_CHAR + 1) / 16;
my $level2 = $level1 / 16;
my @filled = (0) x $level1;
my $pos = 16; # for the null subset
my $data_total = 0;
for (my $i = 0; $i < 65536; $i++)
for (my $i = 0; $i <= $MAX_CHAR; $i++)
{
next unless defined $decomp[$i];
if ($filled[$i >> 4] == 0)
@ -2109,9 +2112,9 @@ sub dump_decompositions($@)
# now count the 256-char subsets that contain something
my @filled_idx = (256) x 256;
$pos = 256 + 16;
for (my $i = 0; $i < 4096; $i++)
my @filled_idx = ($level2) x $level2;
$pos = $level2 + 16;
for (my $i = 0; $i < $level1; $i++)
{
next unless $filled[$i];
$filled_idx[$i >> 4] = $pos;
@ -2123,7 +2126,7 @@ sub dump_decompositions($@)
# add the index offsets to the subsets positions
for (my $i = 0; $i < 4096; $i++)
for (my $i = 0; $i < $level1; $i++)
{
next unless $filled[$i];
$filled[$i] += $null_offset;
@ -2138,9 +2141,9 @@ sub dump_decompositions($@)
# dump the second-level indexes
for (my $i = 0; $i < 256; $i++)
for (my $i = 0; $i < $level2; $i++)
{
next unless ($filled_idx[$i] > 256);
next unless ($filled_idx[$i] > $level2);
my @table = @filled[($i<<4)..($i<<4)+15];
for (my $j = 0; $j < 16; $j++) { $table[$j] ||= $null_offset; }
printf OUTPUT ",\n /* sub-index %02x */\n", $i;
@ -2155,7 +2158,7 @@ sub dump_decompositions($@)
$pos = $total;
my @data;
for (my $i = 0; $i < 4096; $i++)
for (my $i = 0; $i < $level1; $i++)
{
next unless $filled[$i];
my @table = (0) x (16);