From d87d4a4a045653aa03543c2ac122402bb6339def Mon Sep 17 00:00:00 2001 From: Alexandre Julliard Date: Tue, 17 Mar 2020 11:26:47 +0100 Subject: [PATCH] unicode: Store data for CT_CTYPE3 types. Signed-off-by: Alexandre Julliard --- dlls/kernelbase/wctype.c | 4 +- libs/port/wctype.c | 4 +- tools/make_unicode | 105 +++++++++++++++++++++++++++------------ 3 files changed, 78 insertions(+), 35 deletions(-) diff --git a/dlls/kernelbase/wctype.c b/dlls/kernelbase/wctype.c index 7f80034abe7..b3417e54020 100644 --- a/dlls/kernelbase/wctype.c +++ b/dlls/kernelbase/wctype.c @@ -183,9 +183,9 @@ const unsigned short DECLSPEC_HIDDEN wctype_table[6480] = 0x0220, 0x0220, 0x0220, 0x0220, 0x0220, 0x0220, 0x0220, 0x0220, 0x0220, 0x0220, 0x0220, 0x0220, 0x0220, 0x0220, 0x0220, 0x7248, 0xb210, 0x5210, 0x5210, 0x5210, 0x5210, 0xb210, 0xb210, 0xb210, - 0xb210, 0x1310, 0xb210, 0xb210, 0x0230, 0xb210, 0xb210, 0x5210, + 0xb210, 0x1312, 0xb210, 0xb210, 0x0230, 0xb210, 0xb210, 0x5210, 0x5210, 0x3214, 0x3214, 0xb210, 0x1312, 0xb210, 0xb210, 0xb210, - 0x3214, 0x1310, 0xb210, 0xb210, 0xb210, 0xb210, 0xb210, 0x1301, + 0x3214, 0x1312, 0xb210, 0xb210, 0xb210, 0xb210, 0xb210, 0x1301, 0x1301, 0x1301, 0x1301, 0x1301, 0x1301, 0x1301, 0x1301, 0x1301, 0x1301, 0x1301, 0x1301, 0x1301, 0x1301, 0x1301, 0x1301, 0xb210, 0x1301, 0x1301, 0x1301, 0x1301, 0x1301, 0x1301, 0x1301, 0x1302, diff --git a/libs/port/wctype.c b/libs/port/wctype.c index a020f48847b..9b38bd0a0e2 100644 --- a/libs/port/wctype.c +++ b/libs/port/wctype.c @@ -60,9 +60,9 @@ const unsigned short wine_wctype_table[16242] = 0xe220, 0xe220, 0xe220, 0xe220, 0xe220, 0xe220, 0xe220, 0xe220, 0xe220, 0xe220, 0xe220, 0xe220, 0xe220, 0xe220, 0xe220, 0xe220, 0x7248, 0xb210, 0x5210, 0x5210, 0x5210, 0x5210, 0xb210, 0xb210, - 0xb210, 0xb210, 0x1310, 0xb210, 0xb210, 0xe230, 0xb210, 0xb210, + 0xb210, 0xb210, 0x1312, 0xb210, 0xb210, 0xe230, 0xb210, 0xb210, 0x5210, 0x5210, 0x3214, 0x3214, 0xb210, 0x1312, 0xb210, 0xb210, - 0xb210, 0x3214, 0x1310, 0xb210, 0xb210, 0xb210, 0xb210, 0xb210, + 0xb210, 0x3214, 0x1312, 0xb210, 0xb210, 0xb210, 0xb210, 0xb210, 0x1301, 0x1301, 0x1301, 0x1301, 0x1301, 0x1301, 0x1301, 0x1301, 0x1301, 0x1301, 0x1301, 0x1301, 0x1301, 0x1301, 0x1301, 0x1301, 0x1301, 0x1301, 0x1301, 0x1301, 0x1301, 0x1301, 0x1301, 0xb210, diff --git a/tools/make_unicode b/tools/make_unicode index 454dba538f1..2b315206c2a 100755 --- a/tools/make_unicode +++ b/tools/make_unicode @@ -110,6 +110,7 @@ my @allfiles = my %ctype = ( + # CT_CTYPE1 "upper" => 0x0001, "lower" => 0x0002, "digit" => 0x0004, @@ -118,8 +119,22 @@ my %ctype = "cntrl" => 0x0020, "blank" => 0x0040, "xdigit" => 0x0080, - "alpha" => 0x0100, - "defin" => 0x0200 + "alpha" => 0x0100 | 0x80000000, + "defin" => 0x0200, + # CT_CTYPE3 in high 16 bits + "nonspacing" => 0x00010000, + "diacritic" => 0x00020000, + "vowelmark" => 0x00040000, + "symbol" => 0x00080000, + "katakana" => 0x00100000, + "hiragana" => 0x00200000, + "halfwidth" => 0x00400000, + "fullwidth" => 0x00800000, + "ideograph" => 0x01000000, + "kashida" => 0x02000000, + "lexical" => 0x04000000, + "highsurrogate" => 0x08000000, + "lowsurrogate" => 0x10000000, ); my %bracket_types = @@ -248,7 +263,7 @@ my %categories = "Lu" => $ctype{"defin"}|$ctype{"alpha"}|$ctype{"upper"}, # Letter, Uppercase "Ll" => $ctype{"defin"}|$ctype{"alpha"}|$ctype{"lower"}, # Letter, Lowercase "Lt" => $ctype{"defin"}|$ctype{"alpha"}|$ctype{"upper"}|$ctype{"lower"}, # Letter, Titlecase - "Mn" => $ctype{"defin"}, # Mark, Non-Spacing + "Mn" => $ctype{"defin"}|$ctype{"nonspacing"}, # Mark, Non-Spacing "Mc" => $ctype{"defin"}, # Mark, Spacing Combining "Me" => $ctype{"defin"}, # Mark, Enclosing "Nd" => $ctype{"defin"}|$ctype{"digit"}, # Number, Decimal Digit @@ -271,10 +286,10 @@ my %categories = "Pi" => $ctype{"defin"}|$ctype{"punct"}, # Punctuation, Initial quote "Pf" => $ctype{"defin"}|$ctype{"punct"}, # Punctuation, Final quote "Po" => $ctype{"defin"}|$ctype{"punct"}, # Punctuation, Other - "Sm" => $ctype{"defin"}, # Symbol, Math - "Sc" => $ctype{"defin"}, # Symbol, Currency - "Sk" => $ctype{"defin"}, # Symbol, Modifier - "So" => $ctype{"defin"} # Symbol, Other + "Sm" => $ctype{"defin"}|$ctype{"symbol"}, # Symbol, Math + "Sc" => $ctype{"defin"}|$ctype{"symbol"}, # Symbol, Currency + "Sk" => $ctype{"defin"}|$ctype{"symbol"}, # Symbol, Modifier + "So" => $ctype{"defin"}|$ctype{"symbol"} # Symbol, Other ); # a few characters need additional categories that cannot be determined automatically @@ -291,7 +306,31 @@ my %special_categories = "punct" => [ 0x24, 0x2b, 0x3c..0x3e, 0x5e, 0x60, 0x7c, 0x7e, 0xa2..0xbe, 0xd7, 0xf7 ], "digit" => [ 0xb2, 0xb3, 0xb9 ], - "lower" => [ 0x2071, 0x207f ] + "lower" => [ 0xaa, 0xba, 0x2071, 0x207f ], + "nonspacing" => [ 0xc0..0xc5, 0xc7..0xcf, 0xd1..0xd6, 0xd8..0xdd, 0xe0..0xe5, 0xe7..0xef, + 0xf1..0xf6, 0xf8..0xfd, 0xff, 0x6de, 0x1929..0x192b, 0x302e..0x302f ], + "diacritic" => [ 0x5e, 0x60, 0xb7, 0xd8, 0xf8 ], + "symbol" => [ 0x09..0x0d, 0x20..0x23, 0x25, 0x26, 0x28..0x2a, 0x2c, 0x2e..0x2f, 0x3a..0x40, + 0x5b..0x60, 0x7b..0x7e, 0xa0..0xa9, 0xab..0xb1, 0xb4..0xb8, 0xbb, 0xbf, + 0x02b9..0x02ba, 0x02c6..0x02cf ], + "halfwidth" => [ 0x20..0x7e, 0xa2..0xa3, 0xa5..0xa6, 0xac, 0xaf, 0x20a9 ], + "fullwidth" => [ 0x2018..0x2019, 0x201c..0x201d, 0x3000..0x3002, 0x300c..0x300d, 0x309b..0x309c, + 0x30a1..0x30ab, 0x30ad, 0x30ad, 0x30af, 0x30b1, 0x30b3, 0x30b5, 0x30b7, 0x30b9, + 0x30bb, 0x30bd, 0x30bf, 0x30c1, 0x30c3, 0x30c4, 0x30c6, 0x30c8, 0x30ca..0x30cf, + 0x30d2, 0x30d5, 0x30d8, 0x30db, 0x30de..0x30ed, 0x30ef, 0x30f2..0x30f3, 0x30fb, + 0x3131..0x3164 ], + "ideograph" => [ 0x3006..0x3007 ], + "lexical" => [ 0x22, 0x24, 0x27, 0x2d, 0x2f, 0x3d, 0x40, 0x5c, 0x5e..0x60, 0x7e, + 0xa8, 0xaa, 0xad, 0xaf, 0xb4, 0xb8, 0xba, + 0x02b0..0x02b8, 0x02bc, 0x02c7, 0x02ca..0x02cb, 0x02cf, 0x02d8..0x02dd, 0x02e0..0x02e3, + 0x037a, 0x0384..0x0385, 0x0387, 0x0559..0x055a, 0x0640, 0x1fbd..0x1fc1, + 0x1fcd..0x1fcf, 0x1fdd..0x1fdf, 0x1fed..0x1fef, 0x1ffd..0x1ffe, 0x2010..0x2015, + 0x2032..0x2034, 0x2038, 0x2043..0x2044, 0x207b..0x207c, 0x207f, 0x208b..0x208c, + 0x2212, 0x2215..0x2216, 0x2500, 0x2504..0x2505, 0x2508..0x2509, 0x254c..0x254d, + 0x3003, 0x301c, 0x3030..0x3035, 0x309b..0x309e, 0x30fd..0x30fe, 0xfe31..0xfe32, + 0xfe58, 0xfe63, 0xfe66, 0xfe68..0xfe69, 0xfe6b, 0xff04, 0xff07, 0xff0d, 0xff0f, + 0xff1d, 0xff20, 0xff3c, 0xff3e, 0xff40, 0xff5e ], + "kashida" => [ 0x0640 ], ); my %directions = @@ -590,7 +629,6 @@ sub load_data() my ($code, $name, $cat, $comb, $bidi, $decomp, $dec, $dig, $num, $mirror, $oldname, $comment, $upper, $lower, $title) = split /;/; - my $dst; my $src = hex $code; die "unknown category $cat" unless defined $categories{$cat}; @@ -618,6 +656,19 @@ sub load_data() } $combining_class_table[$src] = ($cat ne "Co") ? $comb : 0x100; # Private Use + $category_table[$src] |= $ctype{"nonspacing"} if $bidi eq "NSM"; + $category_table[$src] |= $ctype{"diacritic"} if $name =~ /^(COMBINING)|(MODIFIER LETTER)\W/; + $category_table[$src] |= $ctype{"vowelmark"} if $name =~ /\sVOWEL/ || $oldname =~ /\sVOWEL/; + $category_table[$src] |= $ctype{"halfwidth"} if $name =~ /^HALFWIDTH\s/; + $category_table[$src] |= $ctype{"fullwidth"} if $name =~ /^FULLWIDTH\s/; + $category_table[$src] |= $ctype{"hiragana"} if $name =~ /(HIRAGANA)|(\WKANA\W)/; + $category_table[$src] |= $ctype{"katakana"} if $name =~ /(KATAKANA)|(\WKANA\W)/; + $category_table[$src] |= $ctype{"ideograph"} if $name =~ /^/) { $start = $src; } if ($name =~ /, Last>/) @@ -645,30 +696,15 @@ sub load_data() if ($1 eq "isolated" || $1 eq "final" || $1 eq "initial" || $1 eq "medial") { ${joining_forms{$1}}[hex $2] = $src; - next; } - next unless ($1 eq "font" || - $1 eq "noBreak" || - $1 eq "circle" || - $1 eq "super" || - $1 eq "sub" || - $1 eq "wide" || - $1 eq "narrow" || - $1 eq "compat" || - $1 eq "small"); - $dst = hex $2; } elsif ($decomp =~ /^\s+0020\s+([0-9a-fA-F]+)/) { # decomposition " 0020 1234" -> combining accent - $dst = hex $1; } elsif ($decomp =~ /^([0-9a-fA-F]+)/) { - # decomposition contains only char values without prefix -> use first char - $dst = hex $1; - $category_table[$src] |= $category_table[$dst] if defined $category_table[$dst]; - # store decomposition if it contains two chars + # store decomposition if ($decomp =~ /^([0-9a-fA-F]+)\s+([0-9a-fA-F]+)$/) { $decomp_table[$src] = $decomp_compat_table[$src] = [ hex $1, hex $2 ]; @@ -679,20 +715,27 @@ sub load_data() $decomp_table[$src] = $decomp_compat_table[$src] = [ hex $1 ]; } } - else - { - next; - } } close $UNICODE_DATA; # patch the category of some special characters + for (my $i = 0; $i < @decomp_table; $i++) + { + next unless defined $decomp_table[$i]; + $category_table[$i] |= $category_table[$decomp_table[$i]->[0]]; + } foreach my $cat (keys %special_categories) { my $flag = $ctype{$cat}; foreach my $i (@{$special_categories{$cat}}) { $category_table[$i] |= $flag; } } + for (my $i = 0; $i < @decomp_compat_table; $i++) + { + next unless defined $decomp_compat_table[$i]; + next unless @{$decomp_compat_table[$i]} == 2; + $category_table[$i] |= $category_table[$decomp_compat_table[$i]->[1]] & $ctype{"diacritic"}; + } # load the composition exclusions @@ -1844,7 +1887,7 @@ sub dump_string_type_table($) printf OUTPUT "/* Automatically generated; DO NOT EDIT!! */\n\n"; printf OUTPUT "#include \"windef.h\"\n\n"; - my @table = @category_table; + my @table = map { ($_ || 0) & 0xffff; } @category_table; # add the direction in the high 4 bits of the category for (my $i = 0; $i < 65536; $i++) @@ -1895,7 +1938,7 @@ sub dump_ctype_tables($) printf OUTPUT "/* Automatically generated; DO NOT EDIT!! */\n\n"; printf OUTPUT "#include \"windef.h\"\n\n"; - my @table = @category_table; + my @table = map { ($_ || 0) & 0xffff; } @category_table; # add the direction in the high 4 bits of the category for (my $i = 0; $i < 65536; $i++)