Added support for composite Unicode characters in MultiByteToWideChar
and WideCharToMultiByte.
This commit is contained in:
Alexandre Julliard 2000-12-29 03:56:06 +00:00
parent 441f874517
commit e709cdbae3
6 changed files with 1578 additions and 57 deletions

View File

@ -254,7 +254,6 @@ INT WINAPI MultiByteToWideChar( UINT page, DWORD flags, LPCSTR src, INT srclen,
if (srclen == -1) srclen = strlen(src) + 1;
if (flags & MB_COMPOSITE) FIXME("MB_COMPOSITE not supported\n");
if (flags & MB_USEGLYPHCHARS) FIXME("MB_USEGLYPHCHARS not supported\n");
switch(page)
@ -330,8 +329,6 @@ INT WINAPI WideCharToMultiByte( UINT page, DWORD flags, LPCWSTR src, INT srclen,
if (srclen == -1) srclen = strlenW(src) + 1;
/* if (flags & WC_COMPOSITECHECK) FIXME( "WC_COMPOSITECHECK (%lx) not supported\n", flags );*/
switch(page)
{
case CP_UTF7:

View File

@ -70,6 +70,7 @@ CODEPAGES = \
C_SRCS = \
casemap.c \
compose.c \
cptable.c \
mbtowc.c \
string.c \

1089
unicode/compose.c Normal file

File diff suppressed because it is too large Load Diff

View File

@ -166,6 +166,7 @@ $DEF_CHAR = ord '?';
READ_DEFAULTS();
DUMP_CASE_MAPPINGS();
DUMP_COMPOSE_TABLES();
DUMP_CTYPE_TABLES();
foreach $file (@allfiles) { HANDLE_FILE( @$file ); }
@ -185,6 +186,8 @@ sub READ_DEFAULTS
@toupper_table = ();
@category_table = ();
@direction_table = ();
@decomp_table = ();
@compose_table = ();
# first setup a few default mappings
@ -285,6 +288,12 @@ sub READ_DEFAULTS
# decomposition contains only char values without prefix -> use first char
$dst = hex $1;
$category_table[$src] |= $category_table[$dst];
# store decomposition if it contains two chars
if ($decomp =~ /^([0-9a-fA-F]+)\s+([0-9a-fA-F]+)$/)
{
$decomp_table[$src] = [ hex $1, hex $2 ];
push @compose_table, [ hex $1, hex $2, $src ];
}
}
else
{
@ -465,7 +474,7 @@ sub DUMP_SBCS_TABLE
next unless defined $uni2cp[$i];
$filled[$i >> 8] = 1;
$subtables++;
$i = ($i & ~255) + 256;
$i |= 255;
}
# output all the subtables into a single array
@ -572,7 +581,7 @@ sub DUMP_DBCS_TABLE
next unless defined $uni2cp[$i];
$filled[$i >> 8] = 1;
$subtables++;
$i = ($i & ~255) + 256;
$i |= 255;
}
# output all the subtables into a single array
@ -669,7 +678,7 @@ sub DUMP_CASE_TABLE
next unless defined $table[$i];
$filled[$i >> 8] = $pos;
$pos += 256;
$i = ($i & ~255) + 256;
$i |= 255;
}
for ($i = 0; $i < 65536; $i++)
{
@ -737,6 +746,144 @@ sub DUMP_CTYPE_TABLES
close OUTPUT;
}
################################################################
# dump the char composition tables
#
# Writes compose.c in the current directory. The file defines:
#   unicode_compose_table       two-level composition lookup built from the
#                               [ first, second, composed ] triples stored
#                               in @compose_table
#   unicode_compose_table_size  number of distinct second chars
#   unicode_decompose_table     three-level decomposition lookup built from
#                               the [ first, second ] pairs in @decomp_table
# Takes no arguments; formatting of the arrays is delegated to DUMP_ARRAY.
# Dies if compose.c cannot be created or cannot be flushed to disk.
sub DUMP_COMPOSE_TABLES
{
    open OUTPUT,">compose.c" or die "Cannot create compose.c: $!";
    printf "Building compose.c\n";
    printf OUTPUT "/* Unicode char composition */\n";
    printf OUTPUT "/* Automatically generated; DO NOT EDIT!! */\n\n";
    printf OUTPUT "#include \"wine/unicode.h\"\n\n";

    ######### composition table

    # group the compositions by their second char:
    # $by_second[second] is a list of [ first, composed ] pairs
    my @by_second = ();
    foreach my $comp (@compose_table)
    {
        my ($first, $second, $composed) = @$comp;
        push @{$by_second[$second]}, [ $first, $composed ];
    }

    # the different second chars we have, in ascending order
    my @seconds = grep { defined $by_second[$_] } 0 .. 65535;
    my $count = @seconds;

    # build the table of second chars and offsets
    # (offsets are counted in pairs of values; the first-char lists start
    # right after the $count entries and the terminator)
    # the table must be lexical: a package-global array would keep stale
    # entries from other code or from a previous call
    my @table = ();
    my $pos = $count + 1;
    foreach my $second (@seconds)
    {
        push @table, $second, $pos;
        $pos += @{$by_second[$second]};
    }
    # terminator with last position
    push @table, 0, $pos;
    printf OUTPUT "const WCHAR unicode_compose_table[0x%x] =\n{\n", 2*$pos;
    printf OUTPUT " /* second chars + offsets */\n%s", DUMP_ARRAY( "0x%04x", 0, @table );

    # build the table of first chars and mappings, sorted by first char
    foreach my $second (@seconds)
    {
        my @pairs = map { @$_ } sort { $a->[0] <=> $b->[0] } @{$by_second[$second]};
        printf OUTPUT ",\n /* 0x%04x */\n%s", $second, DUMP_ARRAY( "0x%04x", 0, @pairs );
    }
    printf OUTPUT "\n};\n\nconst unsigned int unicode_compose_table_size = %d;\n\n", $count;

    ######### decomposition table

    # first determine all the 16-char subsets that contain something
    # ($subset_pos[n] == 0 means that subset n is empty)
    my @subset_pos = (0) x 4096;
    $pos = 16*2;  # for the null subset
    for (my $i = 0; $i < 65536; $i++)
    {
        next unless defined $decomp_table[$i];
        $subset_pos[$i >> 4] = $pos;
        $pos += 16*2;
        $i |= 15;  # skip to the end of this subset
    }
    my $total = $pos;

    # now count the 256-char subsets that contain something
    # (256 is the offset of the null sub-index, right after the main index)
    my @filled_idx = (256) x 256;
    $pos = 256 + 16;
    for (my $i = 0; $i < 4096; $i++)
    {
        next unless $subset_pos[$i];
        $filled_idx[$i >> 4] = $pos;
        $pos += 16;
        $i |= 15;  # skip to the end of this sub-index
    }
    my $null_offset = $pos;  # null mapping
    $total += $pos;

    # add the index offsets to the subsets positions
    for (my $i = 0; $i < 4096; $i++)
    {
        next unless $subset_pos[$i];
        $subset_pos[$i] += $null_offset;
    }

    # dump the main index
    printf OUTPUT "const WCHAR unicode_decompose_table[%d] =\n", $total;
    printf OUTPUT "{\n /* index */\n";
    printf OUTPUT "%s", DUMP_ARRAY( "0x%04x", 0, @filled_idx );
    printf OUTPUT ",\n /* null sub-index */\n%s", DUMP_ARRAY( "0x%04x", 0, ($null_offset) x 16 );

    # dump the second-level indexes
    for (my $i = 0; $i < 256; $i++)
    {
        next unless ($filled_idx[$i] > 256);
        # empty subsets point to the null mapping
        my @sub_index = map { $_ || $null_offset } @subset_pos[($i<<4)..($i<<4)+15];
        printf OUTPUT ",\n /* sub-index %02x */\n", $i;
        printf OUTPUT "%s", DUMP_ARRAY( "0x%04x", 0, @sub_index );
    }

    # dump the 16-char subsets
    printf OUTPUT ",\n /* null mapping */\n";
    printf OUTPUT "%s", DUMP_ARRAY( "0x%04x", 0, (0) x 32 );

    for (my $i = 0; $i < 4096; $i++)
    {
        next unless $subset_pos[$i];
        my @subset = (0) x 32;
        for (my $j = 0; $j < 16; $j++)
        {
            my $decomp = $decomp_table[($i << 4) + $j];
            next unless defined $decomp;
            $subset[2 * $j] = $decomp->[0];
            $subset[2 * $j + 1] = $decomp->[1];
        }
        printf OUTPUT ",\n /* 0x%03x0 .. 0x%03xf */\n", $i, $i;
        printf OUTPUT "%s", DUMP_ARRAY( "0x%04x", 0, @subset );
    }

    printf OUTPUT "\n};\n";
    # buffered write errors only show up at close time
    close OUTPUT or die "Cannot close compose.c: $!";
}
################################################################
# read an input file and generate the corresponding .c file
sub HANDLE_FILE

View File

@ -9,6 +9,23 @@
#include "winnls.h"
#include "wine/unicode.h"
/* get the decomposition of a Unicode char
 *
 * Stores the decomposed form of 'src' into 'dst' and returns the number of
 * chars stored. Returns 0 if the decomposition does not fit in 'dstlen'
 * chars. 'dst' must have room for at least one char, since the first
 * element is written unconditionally.
 */
static int get_decomposition( WCHAR src, WCHAR *dst, unsigned int dstlen )
{
    extern const WCHAR unicode_decompose_table[];
    const WCHAR *sub_index, *pair;
    int count;

    /* a char without decomposition maps to itself */
    *dst = src;

    /* high byte selects the sub-index, next nibble the 16-char subset, */
    /* low nibble the pair of chars inside that subset */
    sub_index = unicode_decompose_table + unicode_decompose_table[src >> 8];
    pair = unicode_decompose_table + sub_index[(src >> 4) & 0x0f] + 2 * (src & 0x0f);

    if (!pair[0]) return 1;     /* no decomposition for this char */
    if (dstlen <= 1) return 0;  /* no room for two chars */

    /* apply the decomposition recursively to the first char */
    count = get_decomposition( pair[0], dst, dstlen - 1 );
    if (!count) return 0;

    /* then append the second char */
    dst[count] = pair[1];
    return count + 1;
}
/* check src string for invalid chars; return non-zero if invalid char found */
static inline int check_invalid_chars_sbcs( const struct sbcs_table *table,
const unsigned char *src, unsigned int srclen )
@ -70,6 +87,33 @@ static inline int mbstowcs_sbcs( const struct sbcs_table *table,
}
}
/* mbstowcs for single-byte code page with char decomposition
 *
 * With dstlen == 0 only the required length is computed.
 * Otherwise returns the number of chars written, or -1 on overflow.
 */
static int mbstowcs_sbcs_decompose( const struct sbcs_table *table,
                                    const unsigned char *src, unsigned int srclen,
                                    WCHAR *dst, unsigned int dstlen )
{
    const WCHAR * const cp2uni = table->cp2uni;
    unsigned int remaining;

    if (!dstlen)  /* compute length */
    {
        WCHAR dummy[4];  /* no decomposition is larger than 4 chars */
        unsigned int total = 0;

        while (srclen)
        {
            total += get_decomposition( cp2uni[*src], dummy, 4 );
            src++;
            srclen--;
        }
        return total;
    }

    remaining = dstlen;
    while (srclen && remaining)
    {
        int res = get_decomposition( cp2uni[*src], dst, remaining );

        if (!res) break;  /* decomposition does not fit */
        remaining -= res;
        dst += res;
        src++;
        srclen--;
    }
    if (srclen) return -1;  /* overflow */
    return dstlen - remaining;
}
/* query necessary dst length for src string */
static inline int get_length_dbcs( const struct dbcs_table *table,
const unsigned char *src, unsigned int srclen )
@ -122,7 +166,9 @@ static inline int mbstowcs_dbcs( const struct dbcs_table *table,
{
const WCHAR * const cp2uni = table->cp2uni;
const unsigned char * const cp2uni_lb = table->cp2uni_leadbytes;
int len;
unsigned int len;
if (!dstlen) return get_length_dbcs( table, src, srclen );
for (len = dstlen; srclen && len; len--, srclen--, src++, dst++)
{
@ -140,6 +186,54 @@ static inline int mbstowcs_dbcs( const struct dbcs_table *table,
}
/* mbstowcs for double-byte code page with character decomposition
 *
 * With dstlen == 0 only the required length is computed.
 * Otherwise returns the number of chars written, or -1 on overflow.
 * A lead byte at the very end of the input is silently dropped.
 */
static int mbstowcs_dbcs_decompose( const struct dbcs_table *table,
                                    const unsigned char *src, unsigned int srclen,
                                    WCHAR *dst, unsigned int dstlen )
{
    const WCHAR * const cp2uni = table->cp2uni;
    const unsigned char * const cp2uni_lb = table->cp2uni_leadbytes;
    unsigned int remaining;
    WCHAR ch;
    int res;

    if (!dstlen)  /* compute length */
    {
        WCHAR dummy[4];  /* no decomposition is larger than 4 chars */
        unsigned int total = 0;

        while (srclen)
        {
            const unsigned char off = cp2uni_lb[*src];

            if (off)  /* lead byte: the char is made of this byte and the next one */
            {
                if (!--srclen) break;  /* partial char, ignore it */
                src++;
                ch = cp2uni[(off << 8) + *src];
            }
            else ch = cp2uni[*src];

            total += get_decomposition( ch, dummy, 4 );
            src++;
            srclen--;
        }
        return total;
    }

    remaining = dstlen;
    while (srclen && remaining)
    {
        const unsigned char off = cp2uni_lb[*src];

        if (off)  /* lead byte: the char is made of this byte and the next one */
        {
            if (!--srclen) break;  /* partial char, ignore it */
            src++;
            ch = cp2uni[(off << 8) + *src];
        }
        else ch = cp2uni[*src];

        res = get_decomposition( ch, dst, remaining );
        if (!res) break;  /* decomposition does not fit */
        dst += res;
        remaining -= res;
        src++;
        srclen--;
    }
    if (srclen) return -1;  /* overflow */
    return dstlen - remaining;
}
/* return -1 on dst buffer overflow, -2 on invalid input char */
int cp_mbstowcs( const union cptable *table, int flags,
const char *src, int srclen,
@ -151,16 +245,22 @@ int cp_mbstowcs( const union cptable *table, int flags,
{
if (check_invalid_chars_sbcs( &table->sbcs, src, srclen )) return -2;
}
if (!(flags & MB_COMPOSITE))
{
if (!dstlen) return srclen;
return mbstowcs_sbcs( &table->sbcs, src, srclen, dst, dstlen );
}
return mbstowcs_sbcs_decompose( &table->sbcs, src, srclen, dst, dstlen );
}
else /* mbcs */
{
if (flags & MB_ERR_INVALID_CHARS)
{
if (check_invalid_chars_dbcs( &table->dbcs, src, srclen )) return -2;
}
if (!dstlen) return get_length_dbcs( &table->dbcs, src, srclen );
if (!(flags & MB_COMPOSITE))
return mbstowcs_dbcs( &table->dbcs, src, srclen, dst, dstlen );
else
return mbstowcs_dbcs_decompose( &table->dbcs, src, srclen, dst, dstlen );
}
}

View File

@ -9,6 +9,90 @@
#include "winnls.h"
#include "wine/unicode.h"
/* search for a character in the unicode_compose_table; helper for compose()
 *
 * The table is made of pairs of values, with the search key stored in the
 * first value of each pair. 'low' and 'high' are inclusive pair indexes.
 * Returns the index of the matching pair, or -1 if not found.
 */
static inline int binary_search( WCHAR ch, int low, int high )
{
    extern const WCHAR unicode_compose_table[];

    while (low <= high)
    {
        const int mid = (low + high) / 2;
        const WCHAR key = unicode_compose_table[2*mid];

        if (key == ch) return mid;
        if (key < ch) low = mid + 1;
        else high = mid - 1;
    }
    return -1;
}
/* return the result of the composition of two Unicode chars, or 0 if none
 *
 * 'str' must point to at least two chars; str[0] is the first char and
 * str[1] the second char of the candidate composition.
 */
static WCHAR compose( const WCHAR *str )
{
    extern const WCHAR unicode_compose_table[];
    extern const unsigned int unicode_compose_table_size;
    int pos;

    /* first level: find the second char among the table_size leading entries */
    pos = binary_search( str[1], 0, unicode_compose_table_size - 1 );
    if (pos == -1) return 0;

    /* second level: find the first char in the range that starts at this */
    /* entry's offset and ends just before the next entry's offset */
    pos = binary_search( str[0], unicode_compose_table[2*pos+1],
                         unicode_compose_table[2*pos+3] - 1 );
    if (pos == -1) return 0;

    /* the value stored next to the first char is the composed char */
    return unicode_compose_table[2*pos+1];
}
/****************************************************************/
/* sbcs support */
/* check if 'ch' is an acceptable sbcs mapping for 'wch'
 *
 * With WC_NO_BEST_FIT_CHARS the mapping must convert back to exactly 'wch'.
 * Otherwise any mapping is accepted, except that the code page default char
 * is only valid for the default Unicode char.
 */
static inline int is_valid_sbcs_mapping( const struct sbcs_table *table, int flags,
                                         WCHAR wch, unsigned char ch )
{
    if (flags & WC_NO_BEST_FIT_CHARS)
    {
        /* strict mode: the char must map back to the same Unicode value */
        return (table->cp2uni[ch] == wch);
    }
    if (ch == (unsigned char)table->info.def_char)
    {
        /* getting the default char is only OK if we asked for it */
        return (wch == table->info.def_unicode_char);
    }
    return 1;
}
/* query necessary dst length for src string
 *
 * Without WC_COMPOSITECHECK every source char yields one output char.
 * With it, a pair of chars that can be composed counts as a single
 * output char whenever the conversion would consume both of them.
 */
static inline int get_length_sbcs( const struct sbcs_table *table, int flags,
                                   const WCHAR *src, unsigned int srclen )
{
    const unsigned char * const uni2cp_low = table->uni2cp_low;
    const unsigned short * const uni2cp_high = table->uni2cp_high;
    unsigned int count = 0;

    if (!(flags & WC_COMPOSITECHECK)) return srclen;

    while (srclen > 1)
    {
        unsigned int consumed = 1;
        const WCHAR composed = compose( src );

        if (composed)
        {
            /* check if we should skip the next char */
            if (flags & (WC_DEFAULTCHAR|WC_DISCARDNS))
            {
                /* in WC_DEFAULTCHAR and WC_DISCARDNS mode, we always skip */
                /* the next char no matter if the composition is valid or not */
                consumed = 2;
            }
            else
            {
                unsigned char ch = uni2cp_low[uni2cp_high[composed >> 8] + (composed & 0xff)];
                if (is_valid_sbcs_mapping( table, flags, composed, ch )) consumed = 2;
            }
        }
        src += consumed;
        srclen -= consumed;
        count++;
    }
    if (srclen) count++;  /* last char */
    return count;
}
/* wcstombs for single-byte code page */
static inline int wcstombs_sbcs( const struct sbcs_table *table,
const WCHAR *src, unsigned int srclen,
@ -61,46 +145,94 @@ static int wcstombs_sbcs_slow( const struct sbcs_table *table, int flags,
char *dst, unsigned int dstlen,
const char *defchar, int *used )
{
const WCHAR * const cp2uni = table->cp2uni;
const unsigned char * const uni2cp_low = table->uni2cp_low;
const unsigned short * const uni2cp_high = table->uni2cp_high;
const unsigned char table_default = table->info.def_char & 0xff;
int ret = srclen, tmp;
if (dstlen < srclen)
{
/* buffer too small: fill it up to dstlen and return error */
srclen = dstlen;
ret = -1;
}
unsigned int len;
int tmp;
WCHAR composed;
if (!defchar) defchar = &table_default;
if (!used) used = &tmp; /* avoid checking on every char */
while (srclen)
for (len = dstlen; srclen && len; dst++, len--, src++, srclen--)
{
unsigned char ch = uni2cp_low[uni2cp_high[*src >> 8] + (*src & 0xff)];
if (((flags & WC_NO_BEST_FIT_CHARS) && (cp2uni[ch] != *src)) ||
(ch == table_default && *src != table->info.def_unicode_char))
WCHAR wch = *src;
if ((flags & WC_COMPOSITECHECK) && (srclen > 1) && (composed = compose(src)))
{
ch = *defchar;
*used = 1;
/* now check if we can use the composed char */
*dst = uni2cp_low[uni2cp_high[composed >> 8] + (composed & 0xff)];
if (is_valid_sbcs_mapping( table, flags, composed, *dst ))
{
/* we have a good mapping, use it */
src++;
srclen--;
continue;
}
*dst++ = ch;
/* no mapping for the composed char, check the other flags */
if (flags & WC_DEFAULTCHAR) /* use the default char instead */
{
*dst = *defchar;
*used = 1;
src++; /* skip the non-spacing char */
srclen--;
continue;
}
if (flags & WC_DISCARDNS) /* skip the second char of the composition */
{
src++;
srclen--;
}
return ret;
/* WC_SEPCHARS is the default */
}
*dst = uni2cp_low[uni2cp_high[wch >> 8] + (wch & 0xff)];
if (!is_valid_sbcs_mapping( table, flags, wch, *dst ))
{
*dst = *defchar;
*used = 1;
}
}
if (srclen) return -1; /* overflow */
return dstlen - len;
}
/****************************************************************/
/* dbcs support */
/* check if 'ch' is an acceptable dbcs mapping for 'wch'
 *
 * The code page default char is only valid for the default Unicode char.
 * With WC_NO_BEST_FIT_CHARS the mapping must also convert back to exactly
 * 'wch'.
 */
static inline int is_valid_dbcs_mapping( const struct dbcs_table *table, int flags,
                                         WCHAR wch, unsigned short ch )
{
    WCHAR roundtrip;

    if (ch == table->info.def_char && wch != table->info.def_unicode_char) return 0;
    if (!(flags & WC_NO_BEST_FIT_CHARS)) return 1;

    /* check if char maps back to the same Unicode value */
    if (ch & 0xff00)
    {
        /* double-byte char: go through the lead byte table */
        const unsigned char off = table->cp2uni_leadbytes[ch >> 8];
        roundtrip = table->cp2uni[(off << 8) + (ch & 0xff)];
    }
    else roundtrip = table->cp2uni[ch & 0xff];

    return (roundtrip == wch);
}
/* query necessary dst length for src string */
static inline int get_length_dbcs( const struct dbcs_table *table,
const WCHAR *src, unsigned int srclen )
static int get_length_dbcs( const struct dbcs_table *table, int flags,
const WCHAR *src, unsigned int srclen,
const char *defchar )
{
const unsigned short * const uni2cp_low = table->uni2cp_low;
const unsigned short * const uni2cp_high = table->uni2cp_high;
WCHAR defchar_value = table->info.def_char;
WCHAR composed;
int len;
if (!defchar && !(flags & WC_COMPOSITECHECK))
{
for (len = 0; srclen; srclen--, src++, len++)
{
if (uni2cp_low[uni2cp_high[*src >> 8] + (*src & 0xff)] & 0xff00) len++;
@ -108,6 +240,48 @@ static inline int get_length_dbcs( const struct dbcs_table *table,
return len;
}
if (defchar) defchar_value = defchar[1] ? ((defchar[0] << 8) | defchar[1]) : defchar[0];
for (len = 0; srclen; len++, srclen--, src++)
{
unsigned short res;
WCHAR wch = *src;
if ((flags & WC_COMPOSITECHECK) && (srclen > 1) && (composed = compose(src)))
{
/* now check if we can use the composed char */
res = uni2cp_low[uni2cp_high[composed >> 8] + (composed & 0xff)];
if (is_valid_dbcs_mapping( table, flags, composed, res ))
{
/* we have a good mapping for the composed char, use it */
if (res & 0xff00) len++;
src++;
srclen--;
continue;
}
/* no mapping for the composed char, check the other flags */
if (flags & WC_DEFAULTCHAR) /* use the default char instead */
{
if (defchar_value & 0xff00) len++;
src++; /* skip the non-spacing char */
srclen--;
continue;
}
if (flags & WC_DISCARDNS) /* skip the second char of the composition */
{
src++;
srclen--;
}
/* WC_SEPCHARS is the default */
}
res = uni2cp_low[uni2cp_high[wch >> 8] + (wch & 0xff)];
if (!is_valid_dbcs_mapping( table, flags, wch, res )) res = defchar_value;
if (res & 0xff00) len++;
}
return len;
}
/* wcstombs for double-byte code page */
static inline int wcstombs_dbcs( const struct dbcs_table *table,
const WCHAR *src, unsigned int srclen,
@ -138,11 +312,10 @@ static int wcstombs_dbcs_slow( const struct dbcs_table *table, int flags,
char *dst, unsigned int dstlen,
const char *defchar, int *used )
{
const WCHAR * const cp2uni = table->cp2uni;
const unsigned short * const uni2cp_low = table->uni2cp_low;
const unsigned short * const uni2cp_high = table->uni2cp_high;
const unsigned char * const cp2uni_lb = table->cp2uni_leadbytes;
WCHAR defchar_value = table->info.def_char;
WCHAR composed;
int len, tmp;
if (defchar) defchar_value = defchar[1] ? ((defchar[0] << 8) | defchar[1]) : defchar[0];
@ -150,32 +323,46 @@ static int wcstombs_dbcs_slow( const struct dbcs_table *table, int flags,
for (len = dstlen; srclen && len; len--, srclen--, src++)
{
unsigned short res = uni2cp_low[uni2cp_high[*src >> 8] + (*src & 0xff)];
unsigned short res;
WCHAR wch = *src;
if (res == table->info.def_char && *src != table->info.def_unicode_char)
if ((flags & WC_COMPOSITECHECK) && (srclen > 1) && (composed = compose(src)))
{
/* now check if we can use the composed char */
res = uni2cp_low[uni2cp_high[composed >> 8] + (composed & 0xff)];
if (is_valid_dbcs_mapping( table, flags, composed, res ))
{
/* we have a good mapping for the composed char, use it */
src++;
srclen--;
goto output_char;
}
/* no mapping for the composed char, check the other flags */
if (flags & WC_DEFAULTCHAR) /* use the default char instead */
{
res = defchar_value;
*used = 1;
src++; /* skip the non-spacing char */
srclen--;
goto output_char;
}
else if (flags & WC_NO_BEST_FIT_CHARS)
if (flags & WC_DISCARDNS) /* skip the second char of the composition */
{
/* check if char maps back to the same Unicode value */
if (res & 0xff00)
{
unsigned char off = cp2uni_lb[res >> 8];
if (cp2uni[(off << 8) + (res & 0xff)] != *src)
{
res = defchar_value;
*used = 1;
}
}
else if (cp2uni[res & 0xff] != *src)
{
res = defchar_value;
*used = 1;
src++;
srclen--;
}
/* WC_SEPCHARS is the default */
}
res = uni2cp_low[uni2cp_high[wch >> 8] + (wch & 0xff)];
if (!is_valid_dbcs_mapping( table, flags, wch, res ))
{
res = defchar_value;
*used = 1;
}
output_char:
if (res & 0xff00)
{
if (len == 1) break; /* do not output a partial char */
@ -196,7 +383,7 @@ int cp_wcstombs( const union cptable *table, int flags,
{
if (table->info.char_size == 1)
{
if (!dstlen) return srclen;
if (!dstlen) return get_length_sbcs( &table->sbcs, flags, src, srclen );
if (flags || defchar || used)
return wcstombs_sbcs_slow( &table->sbcs, flags, src, srclen,
dst, dstlen, defchar, used );
@ -204,7 +391,7 @@ int cp_wcstombs( const union cptable *table, int flags,
}
else /* mbcs */
{
if (!dstlen) return get_length_dbcs( &table->dbcs, src, srclen );
if (!dstlen) return get_length_dbcs( &table->dbcs, flags, src, srclen, defchar );
if (flags || defchar || used)
return wcstombs_dbcs_slow( &table->dbcs, flags, src, srclen,
dst, dstlen, defchar, used );