#!/usr/bin/perl

#############################
# unicodereplace.pl - replaces high codepoints in infile with \uHEX escapes
####
# Usage: unicodereplace.pl infile outfile
#
# Reads infile as UTF-8 and writes outfile as UTF-8.  Every character whose
# codepoint is below 127, and the two BOM values U+FEFF / U+FFFE, is copied
# through unchanged; every other character is written as a backslash, the
# letter "u", and the codepoint in lowercase hex, zero-padded to at least
# four digits.
#############################

use warnings;
use strict;
use utf8;

# Both file names are required; without this check a missing argument would
# reach open() as undef instead of producing a clear message.
die("Usage: $0 infile outfile\n") unless @ARGV == 2;

my ($infile, $outfile) = @ARGV; # get arguments

# Lexical handles instead of bareword globals.  The lax ":utf8" layer is kept
# on purpose so that U+FFFE (a noncharacter) can still be read and written.
open(my $in,  "<:utf8", $infile)  or die("Can't open $infile: $!");
open(my $out, ">:utf8", $outfile) or die("Can't open $outfile: $!");

# Loop over lines in the infile.  The condition must actually read from the
# handle: an empty "while ()" is an infinite loop that never sets a line.
while (my $line = <$in>) {
    print {$out} escape_high_codepoints($line)
        or die("Can't write to $outfile: $!");
}

close($in);
# Buffered write errors only surface when the handle is closed.
close($out) or die("Can't close $outfile: $!");

# escape_high_codepoints($text)
#   Returns a copy of $text in which each character is either passed through
#   unchanged (codepoint < 127, or a BOM value 0xFEFF / 0xFFFE) or replaced
#   by "\u" followed by its codepoint formatted with "%04x".
#   NOTE(review): codepoints above U+FFFF produce more than four hex digits
#   (e.g. \u1f600); confirm the consumer of outfile accepts that form.
sub escape_high_codepoints {
    my ($text) = @_;

    return join '', map {
        my $cp = ord;
        ($cp < 127 or $cp == 0xFFFE or $cp == 0xFEFF)
            ? $_                          # normal ASCII codepoint or a BOM
            : sprintf("\\u%04x", $cp);    # otherwise emit as \uHEX
    } split('', $text);
}