unicode: Store data for CT_CTYPE3 types.

Signed-off-by: Alexandre Julliard <julliard@winehq.org>
This commit is contained in:
Alexandre Julliard 2020-03-17 11:26:47 +01:00
parent 57a6033c0a
commit d87d4a4a04
3 changed files with 78 additions and 35 deletions

View File

@ -183,9 +183,9 @@ const unsigned short DECLSPEC_HIDDEN wctype_table[6480] =
0x0220, 0x0220, 0x0220, 0x0220, 0x0220, 0x0220, 0x0220, 0x0220,
0x0220, 0x0220, 0x0220, 0x0220, 0x0220, 0x0220, 0x0220, 0x7248,
0xb210, 0x5210, 0x5210, 0x5210, 0x5210, 0xb210, 0xb210, 0xb210,
0xb210, 0x1310, 0xb210, 0xb210, 0x0230, 0xb210, 0xb210, 0x5210,
0xb210, 0x1312, 0xb210, 0xb210, 0x0230, 0xb210, 0xb210, 0x5210,
0x5210, 0x3214, 0x3214, 0xb210, 0x1312, 0xb210, 0xb210, 0xb210,
0x3214, 0x1310, 0xb210, 0xb210, 0xb210, 0xb210, 0xb210, 0x1301,
0x3214, 0x1312, 0xb210, 0xb210, 0xb210, 0xb210, 0xb210, 0x1301,
0x1301, 0x1301, 0x1301, 0x1301, 0x1301, 0x1301, 0x1301, 0x1301,
0x1301, 0x1301, 0x1301, 0x1301, 0x1301, 0x1301, 0x1301, 0xb210,
0x1301, 0x1301, 0x1301, 0x1301, 0x1301, 0x1301, 0x1301, 0x1302,

View File

@ -60,9 +60,9 @@ const unsigned short wine_wctype_table[16242] =
0xe220, 0xe220, 0xe220, 0xe220, 0xe220, 0xe220, 0xe220, 0xe220,
0xe220, 0xe220, 0xe220, 0xe220, 0xe220, 0xe220, 0xe220, 0xe220,
0x7248, 0xb210, 0x5210, 0x5210, 0x5210, 0x5210, 0xb210, 0xb210,
0xb210, 0xb210, 0x1310, 0xb210, 0xb210, 0xe230, 0xb210, 0xb210,
0xb210, 0xb210, 0x1312, 0xb210, 0xb210, 0xe230, 0xb210, 0xb210,
0x5210, 0x5210, 0x3214, 0x3214, 0xb210, 0x1312, 0xb210, 0xb210,
0xb210, 0x3214, 0x1310, 0xb210, 0xb210, 0xb210, 0xb210, 0xb210,
0xb210, 0x3214, 0x1312, 0xb210, 0xb210, 0xb210, 0xb210, 0xb210,
0x1301, 0x1301, 0x1301, 0x1301, 0x1301, 0x1301, 0x1301, 0x1301,
0x1301, 0x1301, 0x1301, 0x1301, 0x1301, 0x1301, 0x1301, 0x1301,
0x1301, 0x1301, 0x1301, 0x1301, 0x1301, 0x1301, 0x1301, 0xb210,

View File

@ -110,6 +110,7 @@ my @allfiles =
my %ctype =
(
# CT_CTYPE1
"upper" => 0x0001,
"lower" => 0x0002,
"digit" => 0x0004,
@ -118,8 +119,22 @@ my %ctype =
"cntrl" => 0x0020,
"blank" => 0x0040,
"xdigit" => 0x0080,
"alpha" => 0x0100,
"defin" => 0x0200
"alpha" => 0x0100 | 0x80000000,
"defin" => 0x0200,
# CT_CTYPE3 in high 16 bits
"nonspacing" => 0x00010000,
"diacritic" => 0x00020000,
"vowelmark" => 0x00040000,
"symbol" => 0x00080000,
"katakana" => 0x00100000,
"hiragana" => 0x00200000,
"halfwidth" => 0x00400000,
"fullwidth" => 0x00800000,
"ideograph" => 0x01000000,
"kashida" => 0x02000000,
"lexical" => 0x04000000,
"highsurrogate" => 0x08000000,
"lowsurrogate" => 0x10000000,
);
my %bracket_types =
@ -248,7 +263,7 @@ my %categories =
"Lu" => $ctype{"defin"}|$ctype{"alpha"}|$ctype{"upper"}, # Letter, Uppercase
"Ll" => $ctype{"defin"}|$ctype{"alpha"}|$ctype{"lower"}, # Letter, Lowercase
"Lt" => $ctype{"defin"}|$ctype{"alpha"}|$ctype{"upper"}|$ctype{"lower"}, # Letter, Titlecase
"Mn" => $ctype{"defin"}, # Mark, Non-Spacing
"Mn" => $ctype{"defin"}|$ctype{"nonspacing"}, # Mark, Non-Spacing
"Mc" => $ctype{"defin"}, # Mark, Spacing Combining
"Me" => $ctype{"defin"}, # Mark, Enclosing
"Nd" => $ctype{"defin"}|$ctype{"digit"}, # Number, Decimal Digit
@ -271,10 +286,10 @@ my %categories =
"Pi" => $ctype{"defin"}|$ctype{"punct"}, # Punctuation, Initial quote
"Pf" => $ctype{"defin"}|$ctype{"punct"}, # Punctuation, Final quote
"Po" => $ctype{"defin"}|$ctype{"punct"}, # Punctuation, Other
"Sm" => $ctype{"defin"}, # Symbol, Math
"Sc" => $ctype{"defin"}, # Symbol, Currency
"Sk" => $ctype{"defin"}, # Symbol, Modifier
"So" => $ctype{"defin"} # Symbol, Other
"Sm" => $ctype{"defin"}|$ctype{"symbol"}, # Symbol, Math
"Sc" => $ctype{"defin"}|$ctype{"symbol"}, # Symbol, Currency
"Sk" => $ctype{"defin"}|$ctype{"symbol"}, # Symbol, Modifier
"So" => $ctype{"defin"}|$ctype{"symbol"} # Symbol, Other
);
# a few characters need additional categories that cannot be determined automatically
@ -291,7 +306,31 @@ my %special_categories =
"punct" => [ 0x24, 0x2b, 0x3c..0x3e, 0x5e, 0x60, 0x7c, 0x7e, 0xa2..0xbe,
0xd7, 0xf7 ],
"digit" => [ 0xb2, 0xb3, 0xb9 ],
"lower" => [ 0x2071, 0x207f ]
"lower" => [ 0xaa, 0xba, 0x2071, 0x207f ],
"nonspacing" => [ 0xc0..0xc5, 0xc7..0xcf, 0xd1..0xd6, 0xd8..0xdd, 0xe0..0xe5, 0xe7..0xef,
0xf1..0xf6, 0xf8..0xfd, 0xff, 0x6de, 0x1929..0x192b, 0x302e..0x302f ],
"diacritic" => [ 0x5e, 0x60, 0xb7, 0xd8, 0xf8 ],
"symbol" => [ 0x09..0x0d, 0x20..0x23, 0x25, 0x26, 0x28..0x2a, 0x2c, 0x2e..0x2f, 0x3a..0x40,
0x5b..0x60, 0x7b..0x7e, 0xa0..0xa9, 0xab..0xb1, 0xb4..0xb8, 0xbb, 0xbf,
0x02b9..0x02ba, 0x02c6..0x02cf ],
"halfwidth" => [ 0x20..0x7e, 0xa2..0xa3, 0xa5..0xa6, 0xac, 0xaf, 0x20a9 ],
"fullwidth" => [ 0x2018..0x2019, 0x201c..0x201d, 0x3000..0x3002, 0x300c..0x300d, 0x309b..0x309c,
0x30a1..0x30ab, 0x30ad, 0x30ad, 0x30af, 0x30b1, 0x30b3, 0x30b5, 0x30b7, 0x30b9,
0x30bb, 0x30bd, 0x30bf, 0x30c1, 0x30c3, 0x30c4, 0x30c6, 0x30c8, 0x30ca..0x30cf,
0x30d2, 0x30d5, 0x30d8, 0x30db, 0x30de..0x30ed, 0x30ef, 0x30f2..0x30f3, 0x30fb,
0x3131..0x3164 ],
"ideograph" => [ 0x3006..0x3007 ],
"lexical" => [ 0x22, 0x24, 0x27, 0x2d, 0x2f, 0x3d, 0x40, 0x5c, 0x5e..0x60, 0x7e,
0xa8, 0xaa, 0xad, 0xaf, 0xb4, 0xb8, 0xba,
0x02b0..0x02b8, 0x02bc, 0x02c7, 0x02ca..0x02cb, 0x02cf, 0x02d8..0x02dd, 0x02e0..0x02e3,
0x037a, 0x0384..0x0385, 0x0387, 0x0559..0x055a, 0x0640, 0x1fbd..0x1fc1,
0x1fcd..0x1fcf, 0x1fdd..0x1fdf, 0x1fed..0x1fef, 0x1ffd..0x1ffe, 0x2010..0x2015,
0x2032..0x2034, 0x2038, 0x2043..0x2044, 0x207b..0x207c, 0x207f, 0x208b..0x208c,
0x2212, 0x2215..0x2216, 0x2500, 0x2504..0x2505, 0x2508..0x2509, 0x254c..0x254d,
0x3003, 0x301c, 0x3030..0x3035, 0x309b..0x309e, 0x30fd..0x30fe, 0xfe31..0xfe32,
0xfe58, 0xfe63, 0xfe66, 0xfe68..0xfe69, 0xfe6b, 0xff04, 0xff07, 0xff0d, 0xff0f,
0xff1d, 0xff20, 0xff3c, 0xff3e, 0xff40, 0xff5e ],
"kashida" => [ 0x0640 ],
);
my %directions =
@ -590,7 +629,6 @@ sub load_data()
my ($code, $name, $cat, $comb, $bidi,
$decomp, $dec, $dig, $num, $mirror,
$oldname, $comment, $upper, $lower, $title) = split /;/;
my $dst;
my $src = hex $code;
die "unknown category $cat" unless defined $categories{$cat};
@ -618,6 +656,19 @@ sub load_data()
}
$combining_class_table[$src] = ($cat ne "Co") ? $comb : 0x100; # Private Use
$category_table[$src] |= $ctype{"nonspacing"} if $bidi eq "NSM";
$category_table[$src] |= $ctype{"diacritic"} if $name =~ /^(COMBINING)|(MODIFIER LETTER)\W/;
$category_table[$src] |= $ctype{"vowelmark"} if $name =~ /\sVOWEL/ || $oldname =~ /\sVOWEL/;
$category_table[$src] |= $ctype{"halfwidth"} if $name =~ /^HALFWIDTH\s/;
$category_table[$src] |= $ctype{"fullwidth"} if $name =~ /^FULLWIDTH\s/;
$category_table[$src] |= $ctype{"hiragana"} if $name =~ /(HIRAGANA)|(\WKANA\W)/;
$category_table[$src] |= $ctype{"katakana"} if $name =~ /(KATAKANA)|(\WKANA\W)/;
$category_table[$src] |= $ctype{"ideograph"} if $name =~ /^<CJK Ideograph/;
$category_table[$src] |= $ctype{"ideograph"} if $name =~ /^CJK COMPATIBILITY IDEOGRAPH/;
$category_table[$src] |= $ctype{"ideograph"} if $name =~ /^HANGZHOU/;
$category_table[$src] |= $ctype{"highsurrogate"} if $name =~ /High Surrogate/;
$category_table[$src] |= $ctype{"lowsurrogate"} if $name =~ /Low Surrogate/;
# copy the category and direction for everything between First/Last pairs
if ($name =~ /, First>/) { $start = $src; }
if ($name =~ /, Last>/)
@ -645,30 +696,15 @@ sub load_data()
if ($1 eq "isolated" || $1 eq "final" || $1 eq "initial" || $1 eq "medial")
{
${joining_forms{$1}}[hex $2] = $src;
next;
}
next unless ($1 eq "font" ||
$1 eq "noBreak" ||
$1 eq "circle" ||
$1 eq "super" ||
$1 eq "sub" ||
$1 eq "wide" ||
$1 eq "narrow" ||
$1 eq "compat" ||
$1 eq "small");
$dst = hex $2;
}
elsif ($decomp =~ /^<compat>\s+0020\s+([0-9a-fA-F]+)/)
{
# decomposition "<compat> 0020 1234" -> combining accent
$dst = hex $1;
}
elsif ($decomp =~ /^([0-9a-fA-F]+)/)
{
# decomposition contains only char values without prefix -> use first char
$dst = hex $1;
$category_table[$src] |= $category_table[$dst] if defined $category_table[$dst];
# store decomposition if it contains two chars
# store decomposition
if ($decomp =~ /^([0-9a-fA-F]+)\s+([0-9a-fA-F]+)$/)
{
$decomp_table[$src] = $decomp_compat_table[$src] = [ hex $1, hex $2 ];
@ -679,20 +715,27 @@ sub load_data()
$decomp_table[$src] = $decomp_compat_table[$src] = [ hex $1 ];
}
}
else
{
next;
}
}
close $UNICODE_DATA;
# patch the category of some special characters
for (my $i = 0; $i < @decomp_table; $i++)
{
next unless defined $decomp_table[$i];
$category_table[$i] |= $category_table[$decomp_table[$i]->[0]];
}
foreach my $cat (keys %special_categories)
{
my $flag = $ctype{$cat};
foreach my $i (@{$special_categories{$cat}}) { $category_table[$i] |= $flag; }
}
for (my $i = 0; $i < @decomp_compat_table; $i++)
{
next unless defined $decomp_compat_table[$i];
next unless @{$decomp_compat_table[$i]} == 2;
$category_table[$i] |= $category_table[$decomp_compat_table[$i]->[1]] & $ctype{"diacritic"};
}
# load the composition exclusions
@ -1844,7 +1887,7 @@ sub dump_string_type_table($)
printf OUTPUT "/* Automatically generated; DO NOT EDIT!! */\n\n";
printf OUTPUT "#include \"windef.h\"\n\n";
my @table = @category_table;
my @table = map { ($_ || 0) & 0xffff; } @category_table;
# add the direction in the high 4 bits of the category
for (my $i = 0; $i < 65536; $i++)
@ -1895,7 +1938,7 @@ sub dump_ctype_tables($)
printf OUTPUT "/* Automatically generated; DO NOT EDIT!! */\n\n";
printf OUTPUT "#include \"windef.h\"\n\n";
my @table = @category_table;
my @table = map { ($_ || 0) & 0xffff; } @category_table;
# add the direction in the high 4 bits of the category
for (my $i = 0; $i < 65536; $i++)