From a40b1d0028bdb110779b3349337d3c66dfddc491 Mon Sep 17 00:00:00 2001
From: Thomas Goyne
Date: Sun, 28 Apr 2013 08:42:31 -0700
Subject: [PATCH] Make unicode.lua a proper module and add basic tests

Also drop support for >4 byte sequences as they were eliminated in RFC 3629.
---
 .../automation/include/aegisub/unicode.lua    |  58 ++++++++
 .../automation/include/aegisub/unicode.moon   |  87 +++++++++++
 aegisub/automation/include/unicode.lua        | 140 ++----------------
 aegisub/automation/tests/modules/unicode.moon |  39 +++++
 4 files changed, 199 insertions(+), 125 deletions(-)
 create mode 100644 aegisub/automation/include/aegisub/unicode.lua
 create mode 100644 aegisub/automation/include/aegisub/unicode.moon
 create mode 100644 aegisub/automation/tests/modules/unicode.moon

diff --git a/aegisub/automation/include/aegisub/unicode.lua b/aegisub/automation/include/aegisub/unicode.lua
new file mode 100644
index 000000000..e889b0202
--- /dev/null
+++ b/aegisub/automation/include/aegisub/unicode.lua
@@ -0,0 +1,58 @@
+local unicode
+unicode = {
+  charwidth = function(s, i)
+    local b = s:byte(i or 1)
+    if not b then
+      return 1
+    elseif b < 128 then
+      return 1
+    elseif b < 224 then
+      return 2
+    elseif b < 240 then
+      return 3
+    else
+      return 4
+    end
+  end,
+  chars = function(s)
+    local curchar, i = 0, 1
+    return function()
+      if i > s:len() then
+        return
+      end
+      local j = i
+      curchar = curchar + 1
+      i = i + unicode.charwidth(s, i)
+      return s:sub(j, i - 1), curchar
+    end
+  end,
+  len = function(s)
+    local n = 0
+    for c in unicode.chars(s) do
+      n = n + 1
+    end
+    return n
+  end,
+  codepoint = function(s)
+    local b = s:byte(1)
+    if not b or b < 128 then
+      return b
+    end
+    local res, w
+    if b < 224 then
+      res = b - 192
+      w = 2
+    elseif b < 240 then
+      res = b - 224
+      w = 3
+    else
+      res = b - 240
+      w = 4
+    end
+    for i = 2, w do
+      res = res * 64 + s:byte(i) - 128
+    end
+    return res
+  end
+}
+return unicode
diff --git a/aegisub/automation/include/aegisub/unicode.moon b/aegisub/automation/include/aegisub/unicode.moon
new file mode 100644
index 000000000..20e8895e1
--- /dev/null
+++ b/aegisub/automation/include/aegisub/unicode.moon
@@ -0,0 +1,87 @@
+-- Copyright (c) 2007, Niels Martin Hansen, Rodrigo Braz Monteiro
+-- All rights reserved.
+--
+-- Redistribution and use in source and binary forms, with or without
+-- modification, are permitted provided that the following conditions are met:
+--
+--   * Redistributions of source code must retain the above copyright notice,
+--     this list of conditions and the following disclaimer.
+--   * Redistributions in binary form must reproduce the above copyright notice,
+--     this list of conditions and the following disclaimer in the documentation
+--     and/or other materials provided with the distribution.
+--   * Neither the name of the Aegisub Group nor the names of its contributors
+--     may be used to endorse or promote products derived from this software
+--     without specific prior written permission.
+--
+-- THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+-- AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+-- IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+-- ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+-- LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+-- CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+-- SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+-- INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+-- CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+-- ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+-- POSSIBILITY OF SUCH DAMAGE.
+
+-- Unicode (UTF-8) support functions for Aegisub Automation 4 Lua
+-- http://www.ietf.org/rfc/rfc3629.txt (sequences are at most 4 bytes long)
+
+local unicode
+unicode =
+  -- Return the number of bytes (1-4) occupied by the character starting at the i'th byte in s (i defaults to 1)
+  charwidth: (s, i) ->
+    b = s\byte i or 1
+    -- FIXME, something in karaskel results in this case, shouldn't happen
+    -- What would "proper" behaviour be? Zero? Or just explode?
+    if not b then 1
+    elseif b < 128 then 1
+    elseif b < 224 then 2
+    elseif b < 240 then 3
+    else 4
+
+  -- Returns an iterator function yielding each character of s together with its 1-based character index
+  chars: (s) ->
+    curchar, i = 0, 1
+    ->
+      return if i > s\len()
+
+      j = i
+      curchar += 1
+      i += unicode.charwidth s, i
+      s\sub(j, i - 1), curchar
+
+  -- Returns the number of characters in s
+  -- Runs in O(s:len()) time!
+  len: (s) ->
+    n = 0
+    n += 1 for c in unicode.chars s
+    n
+
+  -- Get codepoint of first char in s (nil if s is empty)
+  codepoint: (s) ->
+    -- Basic cases: empty string (b is nil) and ASCII, where the byte is the codepoint
+    b = s\byte 1
+    return b if not b or b < 128
+
+    -- Use a naive decoding algorithm, and assume input is valid
+    local res, w
+
+    if b < 224 then
+      -- prefix byte is 110xxxxx
+      res = b - 192
+      w = 2
+    elseif b < 240 then
+      -- prefix byte is 1110xxxx
+      res = b - 224
+      w = 3
+    else
+      res = b - 240
+      w = 4
+
+    for i = 2, w
+      res = res*64 + s\byte(i) - 128
+    res
+
+return unicode
diff --git a/aegisub/automation/include/unicode.lua b/aegisub/automation/include/unicode.lua
index be4611b3a..caed5bbc0 100644
--- a/aegisub/automation/include/unicode.lua
+++ b/aegisub/automation/include/unicode.lua
@@ -1,126 +1,16 @@
-﻿--[[
- Copyright (c) 2007, Niels Martin Hansen, Rodrigo Braz Monteiro
- All rights reserved.
+﻿-- Copyright (c) 2013, Thomas Goyne
+--
+-- Permission to use, copy, modify, and distribute this software for any
+-- purpose with or without fee is hereby granted, provided that the above
+-- copyright notice and this permission notice appear in all copies.
+-- +-- THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +-- WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +-- MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR +-- ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +-- WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN +-- ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF +-- OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are met: - - * Redistributions of source code must retain the above copyright notice, - this list of conditions and the following disclaimer. - * Redistributions in binary form must reproduce the above copyright notice, - this list of conditions and the following disclaimer in the documentation - and/or other materials provided with the distribution. - * Neither the name of the Aegisub Group nor the names of its contributors - may be used to endorse or promote products derived from this software - without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE - LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - POSSIBILITY OF SUCH DAMAGE. 
-]] - --- Unicode (UTF-8) support functions for Aegisub Automation 4 Lua --- http://www.ietf.org/rfc/rfc2279.txt - -module("unicode") - --- Return the number of bytes occupied by the character starting at the i'th byte in s -function charwidth(s, i) - local b = s:byte(i or 1) - if not b then - --aegisub.debug.out(3, "unicode.charwidth of '%s' @ %d, nil byte\n", s, i) - -- FIXME, something in karaskel results in this case, shouldn't happen - -- What would "proper" behaviour be? Zero? Or just explode? - return 1 - elseif b < 128 then - return 1 - elseif b < 224 then - return 2 - elseif b < 240 then - return 3 - elseif b < 248 then - return 4 - elseif b < 252 then - return 5 - else - return 6 - end - -- Actually there are more possibilities, but those aren't really legal -end - --- Returns an iterator function for iterating over the characters in s -function chars(s) - local curchar, i = 0, 1 - - local function itor() - if i > s:len() then - return nil - end - - local width = charwidth(s, i) - local j = i - curchar = curchar + 1 - i = i + width - return s:sub(j, i-1), curchar - end - - return itor -end - --- Returns the number of characters in s --- Runs in O(s:len()) time! 
-function len(s) - local n = 0 - for c in chars(s) do - n = n + 1 - end - return n -end - --- Get codepoint of first char in s -function codepoint(s) - -- Basic case, ASCII - local b = s:byte(1) - if s:byte(1) < 128 then - return s:byte(1) - end - - -- Use a naive decoding algorithm, and assume input is valid - local res, w = 0 - - if b < 224 then - -- prefix byte is 110xxxxx - res = b - 192 - w = 2 - elseif b < 240 then - -- prefix byte is 11100000 - res = b - 224 - w = 3 - elseif b < 248 then - -- prefix byte is 11110000 - res = b - 240 - w = 4 - elseif b < 252 then - -- prefix byte is 11111000 - res = b - 248 - w = 5 - else - -- prefix byte is 11111100 - res = b - 252 - w = 6 - end - - for i = 2, w do - res = res*64 + s:byte(i) - 128 - end - - return res -end +_G.unicode = require 'aegisub.unicode' +return _G.unicode diff --git a/aegisub/automation/tests/modules/unicode.moon b/aegisub/automation/tests/modules/unicode.moon new file mode 100644 index 000000000..627f8ca02 --- /dev/null +++ b/aegisub/automation/tests/modules/unicode.moon @@ -0,0 +1,39 @@ +-- Copyright (c) 2013, Thomas Goyne +-- +-- Permission to use, copy, modify, and distribute this software for any +-- purpose with or without fee is hereby granted, provided that the above +-- copyright notice and this permission notice appear in all copies. +-- +-- THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +-- WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +-- MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR +-- ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +-- WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN +-- ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF +-- OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 
+
+require 'lunatest'
+unicode = require 'aegisub.unicode'
+-- Non-ASCII literals used below: U+00DF (2 bytes), U+FF43 (3 bytes), U+1F113 (4 bytes)
+export test_char_widths = ->
+  assert_equal 1, unicode.charwidth 'a'
+  assert_equal 2, unicode.charwidth 'ß'
+  assert_equal 3, unicode.charwidth 'ｃ'
+  assert_equal 4, unicode.charwidth '🄓'
+
+export test_char_iterator = ->
+  chars = [c for c in unicode.chars 'aßｃ🄓']
+  assert_equal 4, #chars
+  assert_equal chars[1], 'a'
+  assert_equal chars[2], 'ß'
+  assert_equal chars[3], 'ｃ'
+  assert_equal chars[4], '🄓'
+
+export test_len = ->
+  assert_equal 4, unicode.len 'aßｃ🄓'
+
+export test_codepoint = ->
+  assert_equal 97, unicode.codepoint 'a'
+  assert_equal 223, unicode.codepoint 'ß'
+  assert_equal 0xFF43, unicode.codepoint 'ｃ'
+  assert_equal 0x1F113, unicode.codepoint '🄓'