#!/usr/bin/perl
#############################
# unicodereplace.pl - replaces every codepoint of 127 and above in infile
# with a \uHEX escape; the BOM characters U+FEFF and U+FFFE are left as-is
####
# Usage: unicodereplace.pl infile outfile
#############################
# oneliner version:
# perl -C7 -ne'print map {$c=ord; ($c < 127 or $c == 0xFEFF or $c == 0xFFFE) ? $_ : sprintf "\\u%04x", $c} split //;' infile > outfile
|
2007-10-29 16:47:50 +01:00
|
|
|
|
|
|
|
use warnings;
|
|
|
|
use strict;
|
|
|
|
use utf8;
|
|
|
|
|
|
|
|
# Main script body: copy infile to outfile, rewriting every character whose
# codepoint is 127 or above as a \uHEX escape. The two BOM values (U+FEFF and
# U+FFFE) are the only such characters passed through unchanged.
#
# Arguments (from @ARGV):
#   infile  - path of the UTF-8 text file to read
#   outfile - path of the file to create/overwrite with the escaped text
#
# Dies with a message if the arguments are missing, if either file cannot be
# opened, or if the output file cannot be flushed and closed.

# Fail early with a usage hint instead of "uninitialized value" warnings and
# a confusing "Can't open :" message. Extra arguments are still ignored.
@ARGV >= 2 or die("Usage: $0 infile outfile\n");
my ($infile, $outfile) = @ARGV; # get arguments

# Lexical filehandles: scoped to this script body rather than package globals.
open(my $in_fh,  "<:utf8", $infile)  or die("Can't open $infile: $!");
open(my $out_fh, ">:utf8", $outfile) or die("Can't open $outfile: $!");

# loop over lines in the infile
while (my $line = <$in_fh>) {
    for my $char (split(//, $line)) { # loop over characters in the line
        my $cp = ord($char);

        # The comparison is strict, so codepoint 127 (DEL) is escaped too.
        if ($cp < 127 or $cp == 0xFFFE or $cp == 0xFEFF) {
            print {$out_fh} $char; # pass through unchanged
        }
        else {
            # %04x is a minimum width: codepoints above U+FFFF are written
            # with five or six hex digits (e.g. \u1f600).
            # NOTE(review): consumers that require exactly four digits would
            # need surrogate pairs here - confirm the target format.
            printf {$out_fh} "\\u%04x", $cp; # otherwise print as \uHEX
        }
    }
}

close($in_fh);

# Output is buffered, so a write error (e.g. disk full) may only surface when
# the handle is flushed at close time; check it so truncation is not silent.
close($out_fh) or die("Can't close $outfile: $!");
|