diff --git a/aegisub/build/libaegisub_vs2008/libaegisub_vs2008.vcproj b/aegisub/build/libaegisub_vs2008/libaegisub_vs2008.vcproj index 17328d587..e66255ba5 100644 --- a/aegisub/build/libaegisub_vs2008/libaegisub_vs2008.vcproj +++ b/aegisub/build/libaegisub_vs2008/libaegisub_vs2008.vcproj @@ -267,6 +267,14 @@ RelativePath="..\..\libaegisub\common\charset.cpp" > + + + + diff --git a/aegisub/build/msbuild/libaegisub/libaegisub.vcxproj b/aegisub/build/msbuild/libaegisub/libaegisub.vcxproj index a103002d7..13937cfe9 100644 --- a/aegisub/build/msbuild/libaegisub/libaegisub.vcxproj +++ b/aegisub/build/msbuild/libaegisub/libaegisub.vcxproj @@ -45,6 +45,7 @@ + @@ -86,6 +87,7 @@ + diff --git a/aegisub/build/msbuild/libaegisub/libaegisub.vcxproj.filters b/aegisub/build/msbuild/libaegisub/libaegisub.vcxproj.filters index 71fa5700e..d2d4de890 100644 --- a/aegisub/build/msbuild/libaegisub/libaegisub.vcxproj.filters +++ b/aegisub/build/msbuild/libaegisub/libaegisub.vcxproj.filters @@ -20,6 +20,9 @@ + + Header Files + Header Files @@ -130,6 +133,9 @@ Source Files\Common + + Source Files\Common + Source Files\Common diff --git a/aegisub/libaegisub/Makefile b/aegisub/libaegisub/Makefile index d8811a145..b321b9e02 100644 --- a/aegisub/libaegisub/Makefile +++ b/aegisub/libaegisub/Makefile @@ -24,6 +24,7 @@ SRC += \ common/cajun/reader.cpp \ common/cajun/writer.cpp \ common/charset.cpp \ + common/charset_6937.cpp \ common/charset_conv.cpp \ common/charset_ucd.cpp \ common/hotkey.cpp \ diff --git a/aegisub/libaegisub/common/charset_6937.cpp b/aegisub/libaegisub/common/charset_6937.cpp new file mode 100644 index 000000000..d86396e40 --- /dev/null +++ b/aegisub/libaegisub/common/charset_6937.cpp @@ -0,0 +1,250 @@ +// Copyright (c) 2012, Thomas Goyne +// +// Permission to use, copy, modify, and distribute this software for any +// purpose with or without fee is hereby granted, provided that the above +// copyright notice and this permission notice appear in all copies. 
+// +// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR +// ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN +// ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF +// OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. +// +// $Id$ + +/// @file charset_6937.cpp +/// @brief A charset converter for ISO-6937-2 +/// @ingroup libaegisub + +#include "../config.h" + +#include "charset_6937.h" + +#ifndef LAGI_PRE +#include +#include +#endif + +#include + +namespace { + +// ISO-6937-2 values for the first 383 Unicode codepoints (U+0000-U+017E), indexed by codepoint; an entry of 0x00 past index 0 means "no mapping" and entries above 0xFF are two-byte sequences written high byte first +const int iso6937_codepoints[] = { + 0x00, 0x01, 0x02, 0x03, 0x04, + 0x05, 0x06, 0x07, 0x08, 0x09, + 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, + 0x0F, 0x10, 0x11, 0x12, 0x13, + 0x14, 0x15, 0x16, 0x17, 0x18, + 0x19, 0x1A, 0x1B, 0x1C, 0x1D, + 0x1E, 0x1F, 0x20, 0x21, 0x22, + 0x23, 0x24, 0x25, 0x26, 0x27, + 0x28, 0x29, 0x2A, 0x2B, 0x2C, + 0x2D, 0x2E, 0x2F, 0x30, 0x31, + 0x32, 0x33, 0x34, 0x35, 0x36, + 0x37, 0x38, 0x39, 0x3A, 0x3B, + 0x3C, 0x3D, 0x3E, 0x3F, 0x40, + 0x41, 0x42, 0x43, 0x44, 0x45, + 0x46, 0x47, 0x48, 0x49, 0x4A, + 0x4B, 0x4C, 0x4D, 0x4E, 0x4F, + 0x50, 0x51, 0x52, 0x53, 0x54, + 0x55, 0x56, 0x57, 0x58, 0x59, + 0x5A, 0x5B, 0x5C, 0x5D, 0x5E, + 0x5F, 0x60, 0x61, 0x62, 0x63, + 0x64, 0x65, 0x66, 0x67, 0x68, + 0x69, 0x6A, 0x6B, 0x6C, 0x6D, + 0x6E, 0x6F, 0x70, 0x71, 0x72, + 0x73, 0x74, 0x75, 0x76, 0x77, + 0x78, 0x79, 0x7A, 0x7B, 0x7C, + 0x7D, 0x7E, 0x7F, 0x80, 0x81, + 0x82, 0x83, 0x84, 0x85, 0x86, + 0x87, 0x88, 0x89, 0x8A, 0x8B, + 0x8C, 0x8D, 0x8E, 0x8F, 0x90, + 0x91, 0x92, 0x93, 0x94, 0x95, + 0x96, 0x97, 0x98, 0x99, 0x9A, + 0x9B, 0x9C, 0x9D, 0x9E, 0x9F, + 0xA0, 0xA1, 0xA2, 0xA3, 0xA8, + 0xA5, 0x00, 0xA7, 0xC820, 0xD3, + 0xE3, 0xAB, 0x00, 0x00, 0xD2, + 0xC520, 0xB0, 0xB1, 
0xB2, 0xB3, + 0xC220, 0xB5, 0xB6, 0xB7, 0xCB20, + 0xD1, 0xEB, 0xBB, 0xBC, 0xBD, + 0xBE, 0xBF, 0xC141, 0xC241, 0xC341, + 0xC441, 0xC841, 0xCA41, 0xE1, 0xCB43, + 0xC145, 0xC245, 0xC345, 0xC845, 0xC149, + 0xC249, 0xC349, 0xC849, 0xE2, 0xC44E, + 0xC14F, 0xC24F, 0xC34F, 0xC44F, 0xC84F, + 0xB4, 0xE9, 0xC155, 0xC255, 0xC355, + 0xC855, 0xC259, 0xEC, 0xFB, 0xC161, + 0xC261, 0xC361, 0xC461, 0xC861, 0xCA61, + 0xF1, 0xCB63, 0xC165, 0xC265, 0xC365, + 0xC865, 0xC169, 0xC269, 0xC369, 0xC869, + 0xF3, 0xC46E, 0xC16F, 0xC26F, 0xC36F, + 0xC46F, 0xC86F, 0xB8, 0xF9, 0xC175, + 0xC275, 0xC375, 0xC875, 0xC279, 0xFC, + 0xC879, 0xC541, 0xC561, 0xC641, 0xC661, + 0xCE41, 0xCE61, 0xC243, 0xC263, 0xC343, + 0xC363, 0xC743, 0xC763, 0xCF43, 0xCF63, + 0xCF44, 0xCF64, 0x00, 0xF2, 0xC545, + 0xC565, 0x00, 0x00, 0xC745, 0xC765, + 0xCE45, 0xCE65, 0xCF45, 0xCF65, 0xC347, + 0xC367, 0xC647, 0xC667, 0xC747, 0xC767, + 0xCB47, 0xCB67, 0xC348, 0xC368, 0xE4, + 0xF4, 0xC449, 0xC469, 0xC549, 0xC569, + 0x00, 0x00, 0xCE49, 0xCE69, 0xC749, + 0xF5, 0xE6, 0xF6, 0xC34A, 0xC36A, + 0xCB4B, 0xCB6B, 0xF0, 0xC24C, 0xC26C, + 0xCB4C, 0xCB6C, 0xCF4C, 0xCF6C, 0xE7, + 0xF7, 0xE8, 0xF8, 0xC24E, 0xC26E, + 0xCB4E, 0xCB6E, 0xCF4E, 0xCF6E, 0xEF, + 0xEE, 0xFE, 0xC54F, 0xC56F, 0x00, + 0x00, 0xCD4F, 0xCD6F, 0xEA, 0xFA, + 0xC252, 0xC272, 0xCB52, 0xCB72, 0xCF52, + 0xCF72, 0xC253, 0xC273, 0xC353, 0xC373, + 0xCB53, 0xCB73, 0xCF53, 0xCF73, 0xCB54, + 0xCB74, 0xCF54, 0xCF74, 0xED, 0xFD, + 0xC455, 0xC475, 0xC555, 0xC575, 0xC655, + 0xC675, 0xCA55, 0xCA75, 0xCD55, 0xCD75, + 0xCE55, 0xCE75, 0xC357, 0xC377, 0xC359, + 0xC379, 0xC859, 0xC25A, 0xC27A, 0xC75A, + 0xC77A, 0xCF5A, 0xCF7A +}; + +struct extended_range { + const int codepoint; + const int value; +}; + +bool operator<(extended_range const& lft, extended_range const& rgt) { + return lft.codepoint < rgt.codepoint; +} + +bool operator<(int lft, extended_range const& rgt) { + return lft < rgt.codepoint; +} + +bool operator<(extended_range const& lft, int rgt) { + return lft.codepoint < rgt; +} + 
+// ISO-6937-2 values for codepoints that don't come in a nice contiguous block; kept sorted by codepoint because get_iso6937 binary-searches it with std::lower_bound +const extended_range iso6937_extended_codepoints[] = { + { 0x02C7, 0xCF20 }, + { 0x02D8, 0xC620 }, + { 0x02D9, 0xC720 }, + { 0x02DA, 0xCA20 }, + { 0x02DB, 0xCE20 }, + { 0x02DD, 0xCD20 }, + { 0x2014, 0xD0 }, + { 0x2018, 0xA9 }, + { 0x2019, 0xB9 }, + { 0x201C, 0xAA }, + { 0x201D, 0xBA }, + { 0x2022, 0xD4 }, // NOTE(review): 0xD4 is usually charted as TRADE MARK SIGN (U+2122), not BULLET (U+2022) - TODO confirm + { 0x20AC, 0xA4 }, // ETSI EN 300 468 extension: euro sign at A4 + { 0x2126, 0xE0 }, + { 0x215B, 0xDC }, + { 0x215C, 0xDD }, + { 0x215D, 0xDE }, + { 0x2190, 0xAC }, + { 0x2191, 0xAD }, + { 0x2192, 0xAE }, + { 0x2193, 0xAF }, + { 0x266A, 0xD5 } +}; + +#define countof(array) (sizeof(array) / sizeof((array)[0])) + +/// Get the ISO-6937-2 value for the given Unicode codepoint, or 0 if it cannot be mapped; codepoint 0 also yields 0, so callers have to check the input as well +int get_iso6937(int codepoint) { + if (static_cast(codepoint) < countof(iso6937_codepoints)) + return iso6937_codepoints[codepoint]; + + const extended_range *end = iso6937_extended_codepoints + countof(iso6937_extended_codepoints); + const extended_range *ext = std::lower_bound(iso6937_extended_codepoints, end, codepoint); + if (ext == end || ext->codepoint != codepoint) + return 0; + return ext->value; +} + +} // namespace { + +namespace agi { namespace charset { + +#ifdef _LIBICONV_VERSION +#define INTERNAL_CHARSET "UCS-4-INTERNAL" +#else +#define INTERNAL_CHARSET "WCHAR_T" +#endif + +Converter6937::Converter6937(bool subst, const char *src) +: to_ucs4(new IconvWrapper(src, INTERNAL_CHARSET)) +, subst(subst) +{ +} + +size_t Converter6937::Convert(const char **inbuf, size_t *inbytesleft, char **outbuf, size_t *outbytesleft) { + // A null input is iconv's request to reset the conversion state, and there is no state to reset + if (!inbuf || !inbytesleft) + return 0; + + size_t bytes_written = 0; + + while (*inbytesleft > 0) { + int in_val = 0; + + // Copy inbuf/inbytesleft so that we don't update them if the + // conversion fails (due to not enough space or a bad sequence) + const char *inbuftmp = *inbuf; + size_t inbyteslefttmp = *inbytesleft; + + char *val_buf 
= reinterpret_cast(&in_val); + size_t val_buf_size = sizeof(in_val); + + // Get the next unicode character from the input; val_buf only has room for one int, which is why E2BIG is tolerated below + size_t ret = to_ucs4->Convert(&inbuftmp, &inbyteslefttmp, &val_buf, &val_buf_size); + if (ret == (size_t)-1 && errno != E2BIG) + return ret; + + // And convert that to ISO-6937-2; a zero result for a non-zero codepoint means it has no mapping + int val = get_iso6937(in_val); + if (!val && in_val) { + if (subst) { + val = '?'; + } + else { + errno = EILSEQ; + return (size_t)-1; + } + } + + if (*outbytesleft < 1 || (val > 255 && *outbytesleft < 2)) { + errno = E2BIG; + return (size_t)-1; + } + +#define WRITE_BYTE(b) \ + do { \ + *(*outbuf)++ = (b); \ + --*outbytesleft; \ + ++bytes_written; \ + } while(0) + + if (val <= 255) + WRITE_BYTE(val); + else { + WRITE_BYTE((val >> 8) & 0xFF); + WRITE_BYTE(val & 0xFF); + } + + // Update the input pointers now that the conversion has succeeded + *inbuf = inbuftmp; + *inbytesleft = inbyteslefttmp; + } + + return bytes_written; +} + +} } // namespace agi::charset diff --git a/aegisub/libaegisub/common/charset_6937.h b/aegisub/libaegisub/common/charset_6937.h new file mode 100644 index 000000000..60e9594c9 --- /dev/null +++ b/aegisub/libaegisub/common/charset_6937.h @@ -0,0 +1,46 @@ +// Copyright (c) 2012, Thomas Goyne +// +// Permission to use, copy, modify, and distribute this software for any +// purpose with or without fee is hereby granted, provided that the above +// copyright notice and this permission notice appear in all copies. +// +// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR +// ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN +// ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF +// OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 
+// +// $Id$ + +/// @file charset_6937.h +/// @brief A charset converter for ISO-6937-2 +/// @ingroup libaegisub + +#include + +namespace agi { namespace charset { + +/// @brief A charset converter for ISO-6937-2 +/// +/// While glibc iconv supports ISO-6937-2, GNU libiconv does not, because +/// it's not used by anything but old subtitle formats +class Converter6937 : public Converter { + /// Converter to UCS-4 so that we only have to deal with unicode codepoints + agi::scoped_ptr to_ucs4; + + /// Should unsupported characters be replaced with '?' + const bool subst; + +public: + /// Constructor + /// @param subst Enable substitution for unsupported characters + /// @param src Source encoding + Converter6937(bool subst, const char *src); + + /// Convert a string. Interface is the same as iconv, except that the number of bytes written is returned on success. + size_t Convert(const char** inbuf, size_t* inbytesleft, char** outbuf, size_t* outbytesleft); +}; + +} } diff --git a/aegisub/libaegisub/common/charset_conv.cpp b/aegisub/libaegisub/common/charset_conv.cpp index b255f41ea..50dff070b 100644 --- a/aegisub/libaegisub/common/charset_conv.cpp +++ b/aegisub/libaegisub/common/charset_conv.cpp @@ -31,6 +31,8 @@ #include #include +#include "charset_6937.h" + // Check if we can use advanced fallback capabilities added in GNU's iconv // implementation #if !defined(_LIBICONV_VERSION) || _LIBICONV_VERSION < 0x010A || defined(LIBICONV_PLUG) @@ -52,244 +54,251 @@ namespace { return strcmp(s1, s2) < 0; } }; -} + + agi::charset::Converter *get_converter(bool subst, const char *src, const char *dst); /// @brief Map a user-friendly encoding name to the real encoding name -static const char* GetRealEncodingName(const char* name) { - static std::map prettyNames; + const char* get_real_encoding_name(const char* name) { + static std::map pretty_names; - if (prettyNames.empty()) { -# define ADD(pretty, real) prettyNames[pretty] = real -# include -# undef ADD - } - - std::map::iterator real = prettyNames.find(name); - if (real != 
prettyNames.end()) { - return real->second; - } - return name; -} - - -namespace agi { - namespace charset { - -static size_t get_bom_size(iconv_t cd) { - // Most (but not all) iconv implementations automatically insert a BOM - // at the beginning of text converted to UTF-8, UTF-16 and UTF-32, but - // we usually don't want this, as some of the wxString using code - // assumes there is no BOM (as the exact encoding is known externally) - // As such, when doing conversions we will strip the BOM if it exists, - // then manually add it when writing files - - char buff[8]; - const char* src = ""; - char *dst = buff; - size_t srcLen = 1; - size_t dstLen = 8; - - size_t res = iconv(cd, ICONV_CONST_CAST(&src), &srcLen, &dst, &dstLen); - assert(res != iconv_failed); - assert(srcLen == 0); - - size_t size = 0; - for (src = buff; src < dst; ++src) { - if (*src) ++size; - } - if (size) { - // If there is a BOM, it will always be at least as big as the NUL - size = std::max(size, (8 - dstLen) / 2); - } - return size; -} - -static void eat_bom(iconv_t cd, size_t bomSize, const char** inbuf, size_t* inbytesleft, char** outbuf, size_t* outbytesleft) { - // If this encoding has a forced BOM (i.e. 
it's UTF-16 or UTF-32 without - // a specified byte order), skip over it - if (bomSize > 0 && inbytesleft && *inbytesleft) { - // libiconv marks the bom as written after writing the first - // character after the bom rather than when it writes the bom, so - // convert at least one extra character - char bom[8]; - char *dst = bom; - size_t dstSize = std::min((size_t)8, bomSize + *outbytesleft); - const char *src = *inbuf; - size_t srcSize = *inbytesleft; - iconv(cd, ICONV_CONST_CAST(&src), &srcSize, &dst, &dstSize); - } -} - -#ifdef ICONV_POSIX -class Converter { - size_t bomSize; - iconv_t cd; -public: - // subst is not used here because POSIX doesn't let you disable substitution - Converter(bool, const char* sourceEncoding, const char* destEncoding) - { - const char *dstEnc = GetRealEncodingName(destEncoding); - cd = iconv_open(dstEnc, "UTF-8"); - if (cd == iconv_invalid) { - throw UnsupportedConversion(std::string(dstEnc) + " is not a supported character set"); + if (pretty_names.empty()) { +# define ADD(pretty, real) pretty_names[pretty] = real +# include +# undef ADD } - bomSize = get_bom_size(cd); - iconv_close(cd); - cd = iconv_open(dstEnc, GetRealEncodingName(sourceEncoding)); - if (cd == iconv_invalid) { - throw UnsupportedConversion(std::string("Cannot convert from ") + sourceEncoding + " to " + destEncoding); - } + std::map::iterator real = pretty_names.find(name); + if (real != pretty_names.end()) + return real->second; + return name; } - ~Converter() { - if (cd != iconv_invalid) iconv_close(cd); - } - size_t Convert(const char** inbuf, size_t* inbytesleft, char** outbuf, size_t* outbytesleft) { - eat_bom(cd, bomSize, inbuf, inbytesleft, outbuf, outbytesleft); - size_t res = iconv(cd, ICONV_CONST_CAST(inbuf), inbytesleft, outbuf, outbytesleft); + size_t get_bom_size(iconv_t cd) { + // Most (but not all) iconv implementations automatically insert a BOM + // at the beginning of text converted to UTF-8, UTF-16 and UTF-32, but + // we usually don't want 
this, as some of the wxString using code + // assumes there is no BOM (as the exact encoding is known externally) + // As such, when doing conversions we will strip the BOM if it exists, + // then manually add it when writing files - // This loop never does anything useful with a POSIX-compliant iconv - // implementation, but those don't seem to actually exist - while (res == iconv_failed && errno != E2BIG) { - ++*inbuf; - --*inbytesleft; - res = iconv(cd, ICONV_CONST_CAST(inbuf), inbytesleft, outbuf, outbytesleft); - } - - return res; - } -}; - -#else - -class Converter : public iconv_fallbacks { - size_t bomSize; - char invalidRep[8]; - size_t invalidRepSize; - iconv_t cd; - static void fallback( - unsigned int code, - void (*callback) (const char *buf, size_t buflen, void* callback_arg), - void *callback_arg, - void *convPtr) - { - // At some point in the future, this should probably switch to a real mapping - // For now, there's just three cases: BOM to nothing, '\' to itself - // (for Shift-JIS, which does not have \) and everything else to '?' 
- if (code == 0xFEFF) return; - if (code == 0x5C) callback("\\", 1, callback_arg); - else { - Converter *self = static_cast(convPtr); - callback(self->invalidRep, self->invalidRepSize, callback_arg); - } - } - Converter(Converter const&); - Converter& operator=(Converter const&); -public: - Converter(bool subst, const char* sourceEncoding, const char* destEncoding) - { - - const char *dstEnc = GetRealEncodingName(destEncoding); - cd = iconv_open(dstEnc, "UTF-8"); - if (cd == iconv_invalid) { - throw UnsupportedConversion(std::string(dstEnc) + " is not a supported character set"); - } - - bomSize = get_bom_size(cd); - - // Get fallback character - const char sbuff[] = "?"; - const char *src = sbuff; - char *dst = invalidRep; - size_t dstLen = 4; + char buff[8]; + const char* src = ""; + char *dst = buff; size_t srcLen = 1; + size_t dstLen = 8; - size_t res = Convert(&src, &srcLen, &dst, &dstLen); + size_t res = iconv(cd, ICONV_CONST_CAST(&src), &srcLen, &dst, &dstLen); assert(res != iconv_failed); assert(srcLen == 0); - invalidRepSize = 4 - dstLen; - - iconv_close(cd); - cd = iconv_open(dstEnc, GetRealEncodingName(sourceEncoding)); - if (cd == iconv_invalid) { - throw UnsupportedConversion(std::string("Cannot convert from ") + sourceEncoding + " to " + destEncoding); + size_t size = 0; + for (src = buff; src < dst; ++src) { + if (*src) ++size; } + if (size) { + // If there is a BOM, it will always be at least as big as the NUL + size = std::max(size, (8 - dstLen) / 2); + } + return size; + } - if (subst) { - data = this; - mb_to_uc_fallback = NULL; - mb_to_wc_fallback = NULL; - uc_to_mb_fallback = fallback; - wc_to_mb_fallback = NULL; - - int transliterate = 1; - iconvctl(cd, ICONV_SET_TRANSLITERATE, &transliterate); - iconvctl(cd, ICONV_SET_FALLBACKS, this); + void eat_bom(iconv_t cd, size_t bomSize, const char** inbuf, size_t* inbytesleft, char** outbuf, size_t* outbytesleft) { + // If this encoding has a forced BOM (i.e. 
it's UTF-16 or UTF-32 without + // a specified byte order), skip over it + if (bomSize > 0 && inbytesleft && *inbytesleft) { + // libiconv marks the bom as written after writing the first + // character after the bom rather than when it writes the bom, so + // convert at least one extra character + char bom[8]; + char *dst = bom; + size_t dstSize = std::min((size_t)8, bomSize + *outbytesleft); + const char *src = *inbuf; + size_t srcSize = *inbytesleft; + iconv(cd, ICONV_CONST_CAST(&src), &srcSize, &dst, &dstSize); } } - ~Converter() { - if (cd != iconv_invalid) iconv_close(cd); + + // Calculate the size of NUL in the given character set by converting a single UTF-8 NUL and measuring the output + size_t nul_size(const char* encoding) { + // We need a character set to convert from with a known encoding of NUL + // UTF-8 seems like the obvious choice + agi::scoped_ptr cd(get_converter(false, "UTF-8", encoding)); + + char dbuff[4]; + char sbuff[] = ""; + char* dst = dbuff; + const char* src = sbuff; + size_t dstLen = sizeof(dbuff); + size_t srcLen = 1; + + size_t ret = cd->Convert(&src, &srcLen, &dst, &dstLen); + assert(ret != iconv_failed); + assert(dst - dbuff > 0); + + return dst - dbuff; } - size_t Convert(const char** inbuf, size_t* inbytesleft, char** outbuf, size_t* outbytesleft) { - eat_bom(cd, bomSize, inbuf, inbytesleft, outbuf, outbytesleft); - size_t res = iconv(cd, ICONV_CONST_CAST(inbuf), inbytesleft, outbuf, outbytesleft); - if (res == iconv_failed && errno == E2BIG && *outbytesleft == 0) { - // libiconv checks if there are any bytes left in the output buffer - // before checking if the conversion would actually write any - // characters to the output buffer, resulting in occasional invalid - // E2BIG false positives - char buff[8]; - size_t buffsize = 8; - char* out = buff; - const char* in = *inbuf; - size_t insize = *inbytesleft; +#ifdef ICONV_POSIX + class ConverterImpl : public agi::charset::Converter { + size_t bomSize; + iconv_t cd; + public: + // subst is not used here because POSIX doesn't let you 
disable substitution + ConverterImpl(bool, const char* sourceEncoding, const char* destEncoding) + { + const char *dstEnc = get_real_encoding_name(destEncoding); + cd = iconv_open(dstEnc, "UTF-8"); + if (cd == iconv_invalid) { + throw agi::charset::UnsupportedConversion(std::string(dstEnc) + " is not a supported character set"); + } - res = iconv(cd, ICONV_CONST_CAST(&in), &insize, &out, &buffsize); - // If no bytes of the output buffer were used, the original - // conversion may have been successful - if (buffsize != 8) { - errno = E2BIG; - res = iconv_failed; + bomSize = get_bom_size(cd); + iconv_close(cd); + cd = iconv_open(dstEnc, get_real_encoding_name(sourceEncoding)); + if (cd == iconv_invalid) { + throw agi::charset::UnsupportedConversion(std::string("Cannot convert from ") + sourceEncoding + " to " + destEncoding); } } + ~ConverterImpl() { + if (cd != iconv_invalid) iconv_close(cd); + } + size_t Convert(const char** inbuf, size_t* inbytesleft, char** outbuf, size_t* outbytesleft) { + eat_bom(cd, bomSize, inbuf, inbytesleft, outbuf, outbytesleft); - return res; - } -}; + size_t res = iconv(cd, ICONV_CONST_CAST(inbuf), inbytesleft, outbuf, outbytesleft); + + // This loop never does anything useful with a POSIX-compliant iconv + // implementation, but those don't seem to actually exist + while (res == iconv_failed && errno != E2BIG) { + ++*inbuf; + --*inbytesleft; + res = iconv(cd, ICONV_CONST_CAST(inbuf), inbytesleft, outbuf, outbytesleft); + } + + return res; + } + }; + +#else + + class ConverterImpl : public iconv_fallbacks, public agi::charset::Converter { + size_t bomSize; + char invalidRep[8]; + size_t invalidRepSize; + iconv_t cd; + static void fallback( + unsigned int code, + void (*callback) (const char *buf, size_t buflen, void* callback_arg), + void *callback_arg, + void *convPtr) + { + // At some point in the future, this should probably switch to a real mapping + // For now, there are just three cases: BOM to nothing, '\' to itself + // (for 
Shift-JIS, which does not have \) and everything else to '?' + if (code == 0xFEFF) return; + if (code == 0x5C) callback("\\", 1, callback_arg); + else { + ConverterImpl *self = static_cast(convPtr); + callback(self->invalidRep, self->invalidRepSize, callback_arg); + } + } + ConverterImpl(ConverterImpl const&); + ConverterImpl& operator=(ConverterImpl const&); + public: + ConverterImpl(bool subst, const char* sourceEncoding, const char* destEncoding) + { + const char *dstEnc = get_real_encoding_name(destEncoding); + cd = iconv_open(dstEnc, "UTF-8"); + if (cd == iconv_invalid) + throw agi::charset::UnsupportedConversion(std::string(dstEnc) + " is not a supported character set"); + + bomSize = get_bom_size(cd); + + // Get fallback character + const char sbuff[] = "?"; + const char *src = sbuff; + char *dst = invalidRep; + size_t dstLen = 4; + size_t srcLen = 1; + + size_t res = Convert(&src, &srcLen, &dst, &dstLen); + assert(res != iconv_failed); + assert(srcLen == 0); + + invalidRepSize = 4 - dstLen; + + iconv_close(cd); + cd = iconv_open(dstEnc, get_real_encoding_name(sourceEncoding)); + if (cd == iconv_invalid) + throw agi::charset::UnsupportedConversion(std::string("Cannot convert from ") + sourceEncoding + " to " + destEncoding); + + if (subst) { + data = this; + mb_to_uc_fallback = NULL; + mb_to_wc_fallback = NULL; + uc_to_mb_fallback = fallback; + wc_to_mb_fallback = NULL; + + int transliterate = 1; + iconvctl(cd, ICONV_SET_TRANSLITERATE, &transliterate); + iconvctl(cd, ICONV_SET_FALLBACKS, static_cast<iconv_fallbacks*>(this)); // iconvctl takes a void*, so pass the iconv_fallbacks base explicitly: with a second, polymorphic base it need not sit at the object's own address + } + } + ~ConverterImpl() { + if (cd != iconv_invalid) iconv_close(cd); + } + size_t Convert(const char** inbuf, size_t* inbytesleft, char** outbuf, size_t* outbytesleft) { + eat_bom(cd, bomSize, inbuf, inbytesleft, outbuf, outbytesleft); + size_t res = iconv(cd, ICONV_CONST_CAST(inbuf), inbytesleft, outbuf, outbytesleft); + + if (res == iconv_failed && errno == E2BIG && *outbytesleft == 0) { + // libiconv checks if there are any bytes left in the output buffer + // 
before checking if the conversion would actually write any + // characters to the output buffer, resulting in occasional invalid + // E2BIG false positives + char buff[8]; + size_t buffsize = 8; + char* out = buff; + const char* in = *inbuf; + size_t insize = *inbytesleft; + + res = iconv(cd, ICONV_CONST_CAST(&in), &insize, &out, &buffsize); + // If no bytes of the output buffer were used, the original + // conversion may have been successful + if (buffsize != 8) { + errno = E2BIG; + res = iconv_failed; + } + } + + return res; + } + }; #endif -// Calculate the size of NUL in the given character set -static size_t NulSize(const char* encoding) { - // We need a character set to convert from with a known encoding of NUL - // UTF-8 seems like the obvious choice - Converter cd(false, "UTF-8", encoding); + agi::charset::Converter *get_converter(bool subst, const char *src, const char *dst) { + try { + return new ConverterImpl(subst, src, dst); + } + catch (agi::charset::UnsupportedConversion const&) { + if (strcmp(dst, "ISO-6937-2")) // iconv cannot do this conversion: rethrow unless the target is ISO-6937-2, which has its own converter + throw; + return new agi::charset::Converter6937(subst, src); + } + } +} // namespace { - char dbuff[4]; - char sbuff[] = ""; - char* dst = dbuff; - const char* src = sbuff; - size_t dstLen = sizeof(dbuff); - size_t srcLen = 1; - - size_t ret = cd.Convert(&src, &srcLen, &dst, &dstLen); - assert(ret != iconv_failed); - assert(dst - dbuff > 0); - - return dst - dbuff; -} +namespace agi { namespace charset { IconvWrapper::IconvWrapper(const char* sourceEncoding, const char* destEncoding, bool enableSubst) : toNulLen(0) , fromNulLen(0) -, conv(new Converter(enableSubst, sourceEncoding, destEncoding)) +, conv(get_converter(enableSubst, sourceEncoding, destEncoding)) { // These need to be set only after we verify that the source and dest // charsets are valid - toNulLen = NulSize(destEncoding); - fromNulLen = NulSize(sourceEncoding); + toNulLen = nul_size(destEncoding); + fromNulLen = nul_size(sourceEncoding); } IconvWrapper::~IconvWrapper() { } 
diff --git a/aegisub/libaegisub/include/libaegisub/charset_conv.h b/aegisub/libaegisub/include/libaegisub/charset_conv.h index a434f7359..f19f968d7 100644 --- a/aegisub/libaegisub/include/libaegisub/charset_conv.h +++ b/aegisub/libaegisub/include/libaegisub/charset_conv.h @@ -41,9 +41,12 @@ DEFINE_SIMPLE_EXCEPTION_NOINNER(BadOutput, ConversionFailure, "iconv/failed/EINV typedef void* iconv_t; -// Helper class that abstracts away the differences between libiconv and -// POSIX iconv implementations -class Converter; +/// Abstract converter interface whose Convert() takes the same arguments as iconv(); +/// implemented by the libiconv/POSIX iconv wrappers and by Converter6937 +struct Converter { + virtual ~Converter() { } + virtual size_t Convert(const char** inbuf, size_t* inbytesleft, char** outbuf, size_t* outbytesleft) = 0; +}; /// @brief A C++ wrapper for iconv class IconvWrapper { diff --git a/aegisub/tests/libaegisub_iconv.cpp b/aegisub/tests/libaegisub_iconv.cpp index bae7d59f3..dea9ecabf 100644 --- a/aegisub/tests/libaegisub_iconv.cpp +++ b/aegisub/tests/libaegisub_iconv.cpp @@ -150,3 +150,36 @@ TEST(lagi_iconv, Roundtrip) { "Jackdaws love my big sphinx of quartz"))); } } + +TEST(lagi_iconv, Iso6937) { + ASSERT_NO_THROW(IconvWrapper("UTF-8", "ISO-6937-2")); + IconvWrapper subst("UTF-8", "ISO-6937-2"); + IconvWrapper no_subst("UTF-8", "ISO-6937-2", false); + + // 7-bit is same as ISO-8859; the cast avoids a narrowing int -> char conversion in the braced initializer + for (int i = 0; i < 128; ++i) { + const char buf[] = { static_cast<char>(i), 0 }; + std::string ret; + EXPECT_NO_THROW(ret = subst.Convert(buf)); + EXPECT_STREQ(buf, ret.c_str()); + } + + std::string ret; + + // LATIN CAPITAL LETTER D WITH CARON (U+010E) - multibyte char in main block + EXPECT_NO_THROW(ret = subst.Convert("\xC4\x8E")); + EXPECT_STREQ("\xCF\x44", ret.c_str()); + + // BREVE - multibyte char in extended ranges + EXPECT_NO_THROW(ret = subst.Convert("\xCB\x98")); + EXPECT_STREQ("\xC6\x20", ret.c_str()); + + // EM DASH - single byte char in extended ranges + EXPECT_NO_THROW(ret = subst.Convert("\xE2\x80\x94")); + EXPECT_STREQ("\xD0", 
ret.c_str()); + + // MODIFIER LETTER MINUS SIGN (U+02D7) - codepoint not in ISO-6937-2, so it is replaced by '?' or rejected + EXPECT_NO_THROW(ret = subst.Convert("\xCB\x97")); + EXPECT_STREQ("?", ret.c_str()); + EXPECT_THROW(no_subst.Convert("\xCB\x97"), agi::charset::BadOutput); +}