libwine: Add support in cpmap.pl for parsing the Windows bestfit codepage files.

This commit is contained in:
Alexandre Julliard 2009-01-19 19:21:55 +01:00
parent e098f427b4
commit 97d31ec789
1 changed files with 153 additions and 47 deletions

View File

@ -56,24 +56,24 @@ $DEF_CHAR = ord '?';
[ 865, "VENDORS/MICSFT/PC/CP865.TXT", 1, "OEM Nordic" ],
[ 866, "VENDORS/MICSFT/PC/CP866.TXT", 1, "OEM Russian" ],
[ 869, "VENDORS/MICSFT/PC/CP869.TXT", 1, "OEM Greek" ],
[ 874, "VENDORS/MICSFT/PC/CP874.TXT", 1, "ANSI/OEM Thai" ],
[ 875, "VENDORS/MICSFT/EBCDIC/CP875.TXT", 0, "IBM EBCDIC Greek" ],
[ 878, "VENDORS/MISC/KOI8-R.TXT", 0, "Russian KOI8" ],
[ 932, "VENDORS/MICSFT/WINDOWS/CP932.TXT", 0, "ANSI/OEM Japanese Shift-JIS" ],
[ 936, "VENDORS/MICSFT/WINDOWS/CP936.TXT", 0, "ANSI/OEM Simplified Chinese GBK" ],
[ 949, "VENDORS/MICSFT/WINDOWS/CP949.TXT", 0, "ANSI/OEM Korean Unified Hangul" ],
[ 950, "VENDORS/MICSFT/WINDOWS/CP950.TXT", 0, "ANSI/OEM Traditional Chinese Big5" ],
[ 1006, "VENDORS/MISC/CP1006.TXT", 0, "IBM Arabic" ],
[ 1026, "VENDORS/MICSFT/EBCDIC/CP1026.TXT", 0, "IBM EBCDIC Latin 5 Turkish" ],
[ 1250, "VENDORS/MICSFT/WINDOWS/CP1250.TXT", 0, "ANSI Eastern Europe" ],
[ 1251, "VENDORS/MICSFT/WINDOWS/CP1251.TXT", 0, "ANSI Cyrillic" ],
[ 1252, "VENDORS/MICSFT/WINDOWS/CP1252.TXT", 0, "ANSI Latin 1" ],
[ 1253, "VENDORS/MICSFT/WINDOWS/CP1253.TXT", 0, "ANSI Greek" ],
[ 1254, "VENDORS/MICSFT/WINDOWS/CP1254.TXT", 0, "ANSI Turkish" ],
[ 1255, "VENDORS/MICSFT/WINDOWS/CP1255.TXT", 0, "ANSI Hebrew" ],
[ 1256, "VENDORS/MICSFT/WINDOWS/CP1256.TXT", 0, "ANSI Arabic" ],
[ 1257, "VENDORS/MICSFT/WINDOWS/CP1257.TXT", 0, "ANSI Baltic" ],
[ 1258, "VENDORS/MICSFT/WINDOWS/CP1258.TXT", 0, "ANSI/OEM Viet Nam" ],
[ 874, "VENDORS/MICSFT/WindowsBestFit/bestfit874.txt", 1, "ANSI/OEM Thai" ],
[ 875, "VENDORS/MICSFT/EBCDIC/CP875.TXT", 0, "IBM EBCDIC Greek" ],
[ 878, "VENDORS/MISC/KOI8-R.TXT", 0, "Russian KOI8" ],
[ 932, "VENDORS/MICSFT/WindowsBestFit/bestfit932.txt", 0, "ANSI/OEM Japanese Shift-JIS" ],
[ 936, "VENDORS/MICSFT/WindowsBestFit/bestfit936.txt", 0, "ANSI/OEM Simplified Chinese GBK" ],
[ 949, "VENDORS/MICSFT/WindowsBestFit/bestfit949.txt", 0, "ANSI/OEM Korean Unified Hangul" ],
[ 950, "VENDORS/MICSFT/WindowsBestFit/bestfit950.txt", 0, "ANSI/OEM Traditional Chinese Big5" ],
[ 1006, "VENDORS/MISC/CP1006.TXT", 0, "IBM Arabic" ],
[ 1026, "VENDORS/MICSFT/EBCDIC/CP1026.TXT", 0, "IBM EBCDIC Latin 5 Turkish" ],
[ 1250, "VENDORS/MICSFT/WindowsBestFit/bestfit1250.txt", 0, "ANSI Eastern Europe" ],
[ 1251, "VENDORS/MICSFT/WindowsBestFit/bestfit1251.txt", 0, "ANSI Cyrillic" ],
[ 1252, "VENDORS/MICSFT/WindowsBestFit/bestfit1252.txt", 0, "ANSI Latin 1" ],
[ 1253, "VENDORS/MICSFT/WindowsBestFit/bestfit1253.txt", 0, "ANSI Greek" ],
[ 1254, "VENDORS/MICSFT/WindowsBestFit/bestfit1254.txt", 0, "ANSI Turkish" ],
[ 1255, "VENDORS/MICSFT/WindowsBestFit/bestfit1255.txt", 0, "ANSI Hebrew" ],
[ 1256, "VENDORS/MICSFT/WindowsBestFit/bestfit1256.txt", 0, "ANSI Arabic" ],
[ 1257, "VENDORS/MICSFT/WindowsBestFit/bestfit1257.txt", 0, "ANSI Baltic" ],
[ 1258, "VENDORS/MICSFT/WindowsBestFit/bestfit1258.txt", 0, "ANSI/OEM Viet Nam" ],
[ 1361, "OBSOLETE/EASTASIA/KSC/JOHAB.TXT", 0, "Korean Johab" ],
[ 10000, "VENDORS/MICSFT/MAC/ROMAN.TXT", 0, "Mac Roman" ],
[ 10006, "VENDORS/MICSFT/MAC/GREEK.TXT", 0, "Mac Greek" ],
@ -724,9 +724,9 @@ sub DUMP_ARRAY
################################################################
# dump an SBCS mapping table
sub DUMP_SBCS_TABLE
sub dump_sbcs_table($$$$$)
{
my ($codepage, $has_glyphs, $name) = @_;
my ($codepage, $has_glyphs, $name, $def, $defw) = @_;
my $i;
# output the ascii->unicode table
@ -734,14 +734,14 @@ sub DUMP_SBCS_TABLE
if ($has_glyphs)
{
printf OUTPUT "static const WCHAR cp2uni[512] =\n";
printf OUTPUT "{\n%s", DUMP_ARRAY( "0x%04x", $DEF_CHAR, @cp2uni[0 .. 255] );
printf OUTPUT "{\n%s", DUMP_ARRAY( "0x%04x", $defw, @cp2uni[0 .. 255] );
printf OUTPUT ",\n /* glyphs */\n%s\n};\n\n",
DUMP_ARRAY( "0x%04x", $DEF_CHAR, get_glyphs_mapping(@cp2uni[0 .. 255]) );
DUMP_ARRAY( "0x%04x", $defw, get_glyphs_mapping(@cp2uni[0 .. 255]) );
}
else
{
printf OUTPUT "static const WCHAR cp2uni[256] =\n";
printf OUTPUT "{\n%s\n};\n\n", DUMP_ARRAY( "0x%04x", $DEF_CHAR, @cp2uni[0 .. 255] );
printf OUTPUT "{\n%s\n};\n\n", DUMP_ARRAY( "0x%04x", $defw, @cp2uni[0 .. 255] );
}
# count the number of unicode->ascii subtables that contain something
@ -763,10 +763,10 @@ sub DUMP_SBCS_TABLE
{
next unless $filled[$i];
printf OUTPUT " /* 0x%02x00 .. 0x%02xff */\n", $i, $i;
printf OUTPUT "%s,\n", DUMP_ARRAY( "0x%02x", $DEF_CHAR, @uni2cp[($i<<8) .. ($i<<8)+255] );
printf OUTPUT "%s,\n", DUMP_ARRAY( "0x%02x", $def, @uni2cp[($i<<8) .. ($i<<8)+255] );
}
printf OUTPUT " /* defaults */\n";
printf OUTPUT "%s\n};\n\n", DUMP_ARRAY( "0x%02x", 0, ($DEF_CHAR) x 256 );
printf OUTPUT "%s\n};\n\n", DUMP_ARRAY( "0x%02x", 0, ($def) x 256 );
# output a table of the offsets of the subtables in the previous array
@ -784,7 +784,7 @@ sub DUMP_SBCS_TABLE
printf OUTPUT "const struct sbcs_table cptable_%03d =\n{\n", $codepage;
printf OUTPUT " { %d, 1, 0x%04x, 0x%04x, \"%s\" },\n",
$codepage, $DEF_CHAR, $DEF_CHAR, $name;
$codepage, $def, $defw, $name;
printf OUTPUT " cp2uni,\n";
if ($has_glyphs) { printf OUTPUT " cp2uni + 256,\n"; }
else { printf OUTPUT " cp2uni,\n"; }
@ -795,9 +795,9 @@ sub DUMP_SBCS_TABLE
################################################################
# dump a DBCS mapping table
sub DUMP_DBCS_TABLE
sub dump_dbcs_table($$$$@)
{
my ($codepage, $name) = @_;
my ($codepage, $name, $def, $defw, @lb_ranges) = @_;
my $i, $x, $y;
# build a list of lead bytes that are actually used
@ -820,14 +820,14 @@ sub DUMP_DBCS_TABLE
# output the ascii->unicode table for the single byte chars
printf OUTPUT "static const WCHAR cp2uni[%d] =\n", 256 * ($#lblist + 2 + $unused);
printf OUTPUT "{\n%s,\n", DUMP_ARRAY( "0x%04x", $DEF_CHAR, @cp2uni[0 .. 255] );
printf OUTPUT "{\n%s,\n", DUMP_ARRAY( "0x%04x", $defw, @cp2uni[0 .. 255] );
# output the default table for unused lead bytes
if ($unused)
{
printf OUTPUT " /* unused lead bytes */\n";
printf OUTPUT "%s,\n", DUMP_ARRAY( "0x%04x", 0, ($DEF_CHAR) x 256 );
printf OUTPUT "%s,\n", DUMP_ARRAY( "0x%04x", 0, ($defw) x 256 );
}
# output the ascii->unicode table for each DBCS lead byte
@ -836,7 +836,7 @@ sub DUMP_DBCS_TABLE
{
my $base = $lblist[$y] << 8;
printf OUTPUT " /* lead byte %02x */\n", $lblist[$y];
printf OUTPUT "%s", DUMP_ARRAY( "0x%04x", $DEF_CHAR, @cp2uni[$base .. $base+255] );
printf OUTPUT "%s", DUMP_ARRAY( "0x%04x", $defw, @cp2uni[$base .. $base+255] );
printf OUTPUT ($y < $#lblist) ? ",\n" : "\n};\n\n";
}
@ -872,10 +872,10 @@ sub DUMP_DBCS_TABLE
{
next unless $filled[$y];
printf OUTPUT " /* 0x%02x00 .. 0x%02xff */\n", $y, $y;
printf OUTPUT "%s,\n", DUMP_ARRAY( "0x%04x", $DEF_CHAR, @uni2cp[($y<<8) .. ($y<<8)+255] );
printf OUTPUT "%s,\n", DUMP_ARRAY( "0x%04x", $def, @uni2cp[($y<<8) .. ($y<<8)+255] );
}
printf OUTPUT " /* defaults */\n";
printf OUTPUT "%s\n};\n\n", DUMP_ARRAY( "0x%04x", 0, ($DEF_CHAR) x 256 );
printf OUTPUT "%s\n};\n\n", DUMP_ARRAY( "0x%04x", 0, ($def) x 256 );
# output a table of the offsets of the subtables in the previous array
@ -893,38 +893,38 @@ sub DUMP_DBCS_TABLE
printf OUTPUT "const struct dbcs_table cptable_%03d =\n{\n", $codepage;
printf OUTPUT " { %d, 2, 0x%04x, 0x%04x, \"%s\" },\n",
$codepage, $DEF_CHAR, $DEF_CHAR, $name;
$codepage, $def, $defw, $name;
printf OUTPUT " cp2uni,\n";
printf OUTPUT " cp2uni_leadbytes,\n";
printf OUTPUT " uni2cp_low,\n";
printf OUTPUT " uni2cp_high,\n";
DUMP_LB_RANGES();
printf OUTPUT " {\n %s\n }\n", DUMP_ARRAY( "0x%02x", 0, @lb_ranges, 0, 0 );
printf OUTPUT "};\n";
}
################################################################
# dump the list of defined lead byte ranges
sub DUMP_LB_RANGES
# get the list of defined lead byte ranges
sub get_lb_ranges()
{
my @list = ();
my @ranges = ();
my $i = 0;
foreach $i (@lead_bytes) { $list[$i] = 1; }
my $on = 0;
printf OUTPUT " { ";
for ($i = 0; $i < 256; $i++)
{
if ($on)
{
if (!defined $list[$i]) { printf OUTPUT "0x%02x, ", $i-1; $on = 0; }
if (!defined $list[$i]) { push @ranges, $i-1; $on = 0; }
}
else
{
if ($list[$i]) { printf OUTPUT "0x%02x, ", $i; $on = 1; }
if ($list[$i]) { push @ranges, $i; $on = 1; }
}
}
if ($on) { printf OUTPUT "0xff, "; }
printf OUTPUT "0x00, 0x00 }\n";
if ($on) { push @ranges, 0xff; }
return @ranges;
}
@ -1199,6 +1199,110 @@ sub DUMP_COMPOSE_TABLES
}
################################################################
# handle a "bestfit" Windows mapping file
#
# Parameters:
#   $filename   - mapping file name, relative to $MAPPREFIX
#   $has_glyphs - passed through to dump_sbcs_table for single-byte codepages
#   $comment    - human-readable codepage description written to the output
#
# The file is read line by line. The recognized lines are:
#   CODEPAGE <n>                      codepage number
#   CPINFO <width> 0x<def> 0x<defw>   char width and the two default chars
#   MBTABLE|WCTABLE|DBCSRANGE|DBCSTABLE <count>
#                                     start of a section
#   0x<hex> 0x<hex>                   data line, interpreted per section
#   ENDCODEPAGE                       end of data
# Comment lines (starting with ';'), blank lines and lines containing ^Z
# are ignored; anything else is a fatal error.
#
# Fills the global @cp2uni, @uni2cp and @lead_bytes arrays, then writes
# "c_NNN.c.new" through the OUTPUT handle and hands the result to save_file.
sub handle_bestfit_file($$$)
{
    my ($filename, $has_glyphs, $comment) = @_;
    my $state = "";
    my $count = 0;   # entries announced by the current section header
    my ($codepage, $width, $def, $defw);
    my ($lb_cur, $lb_end);
    my @lb_ranges = ();

    my $path = $MAPPREFIX . $filename;
    open INPUT, "<", $path or die "Cannot open $path: $!";

    while (<INPUT>)
    {
        next if /^;/;  # skip comments
        next if /^\s*$/;  # skip empty lines
        next if /\x1a/;  # skip ^Z
        last if /^ENDCODEPAGE/;

        if (/^CODEPAGE\s+(\d+)/)
        {
            $codepage = $1;
            next;
        }
        if (/^CPINFO\s+(\d+)\s+0x([0-9a-fA-F]+)\s+0x([0-9a-fA-F]+)/)
        {
            $width = $1;
            $def = hex $2;
            $defw = hex $3;
            next;
        }
        if (/^(MBTABLE|WCTABLE|DBCSRANGE|DBCSTABLE)\s+(\d+)/)
        {
            $state = $1;
            $count = $2;
            next;
        }
        if (/^0x([0-9a-fA-F]+)\s+0x([0-9a-fA-F]+)/)
        {
            my ($first, $second) = (hex $1, hex $2);

            if ($state eq "MBTABLE")
            {
                # codepage char -> unicode; the first mapping seen wins
                $cp2uni[$first] = $second unless defined($cp2uni[$first]);
                next;
            }
            if ($state eq "WCTABLE")
            {
                # unicode -> codepage char; the first mapping seen wins
                $uni2cp[$first] = $second unless defined($uni2cp[$first]);
                next;
            }
            if ($state eq "DBCSRANGE")
            {
                # a range of lead bytes, both bounds inclusive
                push @lb_ranges, $first, $second;
                for (my $i = $first; $i <= $second; $i++)
                {
                    push @lead_bytes, $i;
                    $cp2uni[$i] = 0;
                }
                $lb_cur = $first;
                $lb_end = $second;
                next;
            }
            if ($state eq "DBCSTABLE")
            {
                # trail byte -> unicode for the current lead byte
                my $cp = ($lb_cur << 8) | $first;
                $cp2uni[$cp] = $second unless defined($cp2uni[$cp]);
                if (!--$count)
                {
                    # this table is complete: move on to the next lead byte,
                    # or expect a new range line once the range is exhausted
                    if (++$lb_cur > $lb_end) { $state = "DBCSRANGE"; }
                }
                next;
            }
        }
        die "$filename: Unrecognized line $_\n";
    }
    close INPUT;

    # without these two lines no meaningful output can be generated
    defined $codepage or die "$filename: no CODEPAGE line found\n";
    defined $width or die "$filename: no CPINFO line found\n";

    my $output = sprintf "c_%03d.c", $codepage;
    open OUTPUT, ">", "$output.new" or die "Cannot create $output.new: $!";

    printf "Building %s from %s (%s)\n", $output, $filename, $comment;

    # dump all tables

    printf OUTPUT "/* code page %03d (%s) */\n", $codepage, $comment;
    printf OUTPUT "/* generated from %s */\n", $path;
    printf OUTPUT "/* DO NOT EDIT!! */\n\n";
    printf OUTPUT "#include \"wine/unicode.h\"\n\n";

    if ($width == 1) { dump_sbcs_table( $codepage, $has_glyphs, $comment, $def, $defw ); }
    else { dump_dbcs_table( $codepage, $comment, $def, $defw, @lb_ranges ); }
    close OUTPUT;
    save_file($output);
}
################################################################
# read an input file and generate the corresponding .c file
sub HANDLE_FILE
@ -1212,11 +1316,13 @@ sub HANDLE_FILE
# symbol codepage file is special
if ($codepage == 20932) { READ_JIS0208_FILE($MAPPREFIX . $filename); }
elsif ($codepage == 20127) { fill_20127_codepage(); }
elsif ($filename =~ /\/bestfit/)
{
handle_bestfit_file( $filename, $has_glyphs, $comment );
return;
}
else { READ_FILE($MAPPREFIX . $filename); }
# hack: 0x00a5 must map to backslash in Shift-JIS
if ($codepage == 932) { $uni2cp[0x00a5] = 0x5c; }
ADD_DEFAULT_MAPPINGS();
my $output = sprintf "c_%03d.c", $codepage;
@ -1238,8 +1344,8 @@ sub HANDLE_FILE
}
printf OUTPUT "#include \"wine/unicode.h\"\n\n";
if ($#lead_bytes == -1) { DUMP_SBCS_TABLE( $codepage, $has_glyphs, $comment ); }
else { DUMP_DBCS_TABLE( $codepage, $comment ); }
if (!@lead_bytes) { dump_sbcs_table( $codepage, $has_glyphs, $comment, $DEF_CHAR, $DEF_CHAR ); }
else { dump_dbcs_table( $codepage, $comment, $DEF_CHAR, $DEF_CHAR, get_lb_ranges() ); }
close OUTPUT;
save_file($output);
}