Aegisub/devel/traydict/unicodereplace.pl

#!/usr/bin/perl

#############################
# unicodereplace.pl - replaces high codepoints in infile with \uHEX escapes
####
# Usage: unicodereplace.pl infile outfile
#############################
# oneliner version:
# perl -C7 -ne'print map {$c=ord; ($c < 127 or $c == 0xFEFF or $c == 0xFFFE) ? $_ : sprintf "\\u%04x", $c} split //;' infile > outfile

use warnings;
use strict;
use utf8;

# Main script body: copy infile to outfile one character at a time, replacing
# every character that is neither below code point 127 nor a BOM value with a
# \uHEX escape.
#
# Arguments (from @ARGV):
#   infile  - file to read, decoded through the :utf8 layer
#   outfile - file to create/overwrite, encoded through the :utf8 layer
# Any further arguments are ignored.
#
# Dies with a usage line when either file name is missing, and with the system
# error when a file cannot be opened or closed.

my ($infile, $outfile) = @ARGV; # get arguments

# Both names are required; stop here instead of letting open() fail on undef.
die("Usage: $0 infile outfile\n")
	unless defined $infile and defined $outfile;

# Lexical handles instead of package-global barewords.
open(my $in, "<:utf8", $infile) or die("Can't open $infile: $!");
open(my $out, ">:utf8", $outfile) or die("Can't open $outfile: $!");

# loop over lines in the infile
while (my $line = <$in>) {
	for my $char (split(//, $line)) { # loop over characters in the line
		my $cp = ord($char);

		# Pass through code points below 127 and the two BOM values unchanged.
		# Note that 127 itself (DEL) is not below 127 and is therefore escaped.
		if ($cp < 127 or $cp == 0xFFFE or $cp == 0xFEFF) {
			print {$out} $char;
		}
		else {
			# %04x is a minimum width: code points above 0xFFFF come out with
			# five or six hex digits.
			# NOTE(review): confirm the consumer of these escapes copes with
			# more than four digits.
			printf {$out} "\\u%04x", $cp; # otherwise print as \uHEX
		}
	}
}

# Buffered write errors only surface at close, so check it explicitly.
close($in) or die("Can't close $infile: $!");
close($out) or die("Can't close $outfile: $!");
added unicodereplace.pl script Originally committed to SVN as r1637. 2007-10-29 16:47:50 +01:00			`#!/usr/bin/perl`

			`#############################`
			`# unicodereplace.pl - replaces high codepoints in infile with \uHEX escapes`
			`####`
			`# Usage: unicodereplace.pl infile outfile`
			`#############################`
add oneliner version of unicodereplace.pl to the script itself so I don't forget about it Originally committed to SVN as r2524. 2008-12-27 00:36:39 +01:00			`# oneliner version:`
			`# perl -C7 -ne'print map {$c=ord; ($c < 127 or $c == 0xFEFF or $c == 0xFFFE) ? $_ : sprintf "\\u%04x", $c} split //;' infile > outfile`
added unicodereplace.pl script Originally committed to SVN as r1637. 2007-10-29 16:47:50 +01:00
			`use warnings;`
			`use strict;`
			`use utf8;`

			`my ($infile, $outfile) = @ARGV; # get arguments`

			`open(INFILE, "<:utf8", $infile) or die("Can't open $infile: $!");`
			`open(OUTFILE, ">:utf8", $outfile) or die("Can't open $outfile: $!");`

			`# loop over lines in the infile`
			`while (<INFILE>) {`
fixed tabs, argh Originally committed to SVN as r1638. 2007-10-29 16:49:46 +01:00			`for (split('', $_)) { # loop over characters in the line`
added unicodereplace.pl script Originally committed to SVN as r1637. 2007-10-29 16:47:50 +01:00			`my $cp = ord;`
fixed tabs, argh Originally committed to SVN as r1638. 2007-10-29 16:49:46 +01:00			`if ($cp < 127 or $cp == 0xFFFE or $cp == 0xFEFF) { # is it a normal ASCII codepoint or a BOM?`
			`print OUTFILE $_; # then pass through unchanged`
added unicodereplace.pl script Originally committed to SVN as r1637. 2007-10-29 16:47:50 +01:00			`}`
			`else {`
fixed tabs, argh Originally committed to SVN as r1638. 2007-10-29 16:49:46 +01:00			`print OUTFILE "\\u", sprintf("%04x", $cp); # otherwise print as \uHEX`
added unicodereplace.pl script Originally committed to SVN as r1637. 2007-10-29 16:47:50 +01:00			`}`
			`}`
			`}`