Add converter to ISO-6937-2

glibc's iconv implementation supports ISO-6937-2, but GNU libiconv does not, because these days the encoding is only used by a few old subtitle formats. As a result, we need our own converter on every platform except Linux. Conversion from ISO-6937-2 is currently not supported. Originally committed to SVN as r6632.
2025-04-11 22:56:02 +02:00 · 2012-03-29 19:04:49 +00:00 · 2012-03-29 19:04:49 +00:00 · f31d9a5a8b
commit f31d9a5a8b
parent 71776940f6
9 changed files with 566 additions and 208 deletions
--- a/aegisub/build/libaegisub_vs2008/libaegisub_vs2008.vcproj
+++ b/aegisub/build/libaegisub_vs2008/libaegisub_vs2008.vcproj
@ -267,6 +267,14 @@
 				RelativePath="..\..\libaegisub\common\charset.cpp"
 				>
 			</File>
+			<File
+				RelativePath="..\..\libaegisub\common\charset_6937.cpp"
+				>
+			</File>
+			<File
+				RelativePath="..\..\libaegisub\common\charset_6937.h"
+				>
+			</File>
 			<File
 				RelativePath="..\..\libaegisub\common\charset_conv.cpp"
 				>
--- a/aegisub/build/msbuild/libaegisub/libaegisub.vcxproj
+++ b/aegisub/build/msbuild/libaegisub/libaegisub.vcxproj
@ -45,6 +45,7 @@
  <ItemGroup>
    <ClInclude Include="$(SrcDir)lagi_pre.h" />
    <ClInclude Include="$(SrcDir)config.h" />
+    <ClInclude Include="$(SrcDir)common\charset_6937.h" />
    <ClInclude Include="$(SrcDir)common\charset_ucd.h" />
    <ClInclude Include="$(SrcDir)common\option_visit.h" />
    <ClInclude Include="$(SrcDir)include\libaegisub\access.h" />
@ -86,6 +87,7 @@
    <ClCompile Include="$(SrcDir)common\cajun\reader.cpp" />
    <ClCompile Include="$(SrcDir)common\cajun\writer.cpp" />
    <ClCompile Include="$(SrcDir)common\charset.cpp" />
+    <ClCompile Include="$(SrcDir)common\charset_6937.cpp" />
    <ClCompile Include="$(SrcDir)common\charset_conv.cpp" />
    <ClCompile Include="$(SrcDir)common\charset_ucd.cpp" />
    <ClCompile Include="$(SrcDir)common\hotkey.cpp" />
--- a/aegisub/build/msbuild/libaegisub/libaegisub.vcxproj.filters
+++ b/aegisub/build/msbuild/libaegisub/libaegisub.vcxproj.filters
@ -20,6 +20,9 @@
    </Filter>
  </ItemGroup>
  <ItemGroup>
+    <ClInclude Include="$(SrcDir)common\charset_6937.h">
+      <Filter>Header Files</Filter>
+    </ClInclude>
    <ClInclude Include="$(SrcDir)common\charset_ucd.h">
      <Filter>Header Files</Filter>
    </ClInclude>
@ -130,6 +133,9 @@
    <ClCompile Include="$(SrcDir)common\charset.cpp">
      <Filter>Source Files\Common</Filter>
    </ClCompile>
+    <ClCompile Include="$(SrcDir)common\charset_6937.cpp">
+      <Filter>Source Files\Common</Filter>
+    </ClCompile>
    <ClCompile Include="$(SrcDir)common\charset_conv.cpp">
      <Filter>Source Files\Common</Filter>
    </ClCompile>
--- a/aegisub/libaegisub/Makefile
+++ b/aegisub/libaegisub/Makefile
@ -24,6 +24,7 @@ SRC += \
 	common/cajun/reader.cpp \
 	common/cajun/writer.cpp \
 	common/charset.cpp \
+	common/charset_6937.cpp \
 	common/charset_conv.cpp \
 	common/charset_ucd.cpp \
 	common/hotkey.cpp \
--- a/aegisub/libaegisub/common/charset_6937.cpp
+++ b/aegisub/libaegisub/common/charset_6937.cpp
@ -0,0 +1,250 @@
+// Copyright (c) 2012, Thomas Goyne <plorkyeran@aegisub.org>
+//
+// Permission to use, copy, modify, and distribute this software for any
+// purpose with or without fee is hereby granted, provided that the above
+// copyright notice and this permission notice appear in all copies.
+//
+// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+// ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+// ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+// OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+//
+// $Id$
+
+/// @file charset_6937.cpp
+/// @brief A charset converter for ISO-6937-2
+/// @ingroup libaegisub
+
+#include "../config.h"
+
+#include "charset_6937.h"
+
+#ifndef LAGI_PRE
+#include <algorithm>
+#include <cerrno>
+#endif
+
+#include <iconv.h>
+
+namespace {
+
+// Unicode -> ISO-6937-2 lookup table, indexed directly by codepoint, covering
+// the first 383 codepoints (U+0000 through U+017E).
+//
+// Entry encoding, as consumed by get_iso6937() and Converter6937::Convert():
+//  - 0x00 at any index other than 0 means the codepoint has no mapping
+//  - values <= 0xFF are emitted as a single byte
+//  - values > 0xFF are emitted as two bytes, high byte first (e.g. the entry
+//    0xC141 for U+00C0 is written as 0xC1 0x41)
+const int iso6937_codepoints[] = {
+	0x00,   0x01,   0x02,   0x03,   0x04,
+	0x05,   0x06,   0x07,   0x08,   0x09,
+	0x0A,   0x0B,   0x0C,   0x0D,   0x0E,
+	0x0F,   0x10,   0x11,   0x12,   0x13,
+	0x14,   0x15,   0x16,   0x17,   0x18,
+	0x19,   0x1A,   0x1B,   0x1C,   0x1D,
+	0x1E,   0x1F,   0x20,   0x21,   0x22,
+	0x23,   0x24,   0x25,   0x26,   0x27,
+	0x28,   0x29,   0x2A,   0x2B,   0x2C,
+	0x2D,   0x2E,   0x2F,   0x30,   0x31,
+	0x32,   0x33,   0x34,   0x35,   0x36,
+	0x37,   0x38,   0x39,   0x3A,   0x3B,
+	0x3C,   0x3D,   0x3E,   0x3F,   0x40,
+	0x41,   0x42,   0x43,   0x44,   0x45,
+	0x46,   0x47,   0x48,   0x49,   0x4A,
+	0x4B,   0x4C,   0x4D,   0x4E,   0x4F,
+	0x50,   0x51,   0x52,   0x53,   0x54,
+	0x55,   0x56,   0x57,   0x58,   0x59,
+	0x5A,   0x5B,   0x5C,   0x5D,   0x5E,
+	0x5F,   0x60,   0x61,   0x62,   0x63,
+	0x64,   0x65,   0x66,   0x67,   0x68,
+	0x69,   0x6A,   0x6B,   0x6C,   0x6D,
+	0x6E,   0x6F,   0x70,   0x71,   0x72,
+	0x73,   0x74,   0x75,   0x76,   0x77,
+	0x78,   0x79,   0x7A,   0x7B,   0x7C,
+	0x7D,   0x7E,   0x7F,   0x80,   0x81,
+	0x82,   0x83,   0x84,   0x85,   0x86,
+	0x87,   0x88,   0x89,   0x8A,   0x8B,
+	0x8C,   0x8D,   0x8E,   0x8F,   0x90,
+	0x91,   0x92,   0x93,   0x94,   0x95,
+	0x96,   0x97,   0x98,   0x99,   0x9A,
+	0x9B,   0x9C,   0x9D,   0x9E,   0x9F,
+	0xA0,   0xA1,   0xA2,   0xA3,   0xA8,
+	0xA5,   0x00,   0xA7,   0xC820, 0xD3,
+	0xE3,   0xAB,   0x00,   0x00,   0xD2,
+	0xC520, 0xB0,   0xB1,   0xB2,   0xB3,
+	0xC220, 0xB5,   0xB6,   0xB7,   0xCB20,
+	0xD1,   0xEB,   0xBB,   0xBC,   0xBD,
+	0xBE,   0xBF,   0xC141, 0xC241, 0xC341,
+	0xC441, 0xC841, 0xCA41, 0xE1,   0xCB43,
+	0xC145, 0xC245, 0xC345, 0xC845, 0xC149,
+	0xC249, 0xC349, 0xC849, 0xE2,   0xC44E,
+	0xC14F, 0xC24F, 0xC34F, 0xC44F, 0xC84F,
+	0xB4,   0xE9,   0xC155, 0xC255, 0xC355,
+	0xC855, 0xC259, 0xEC,   0xFB,   0xC161,
+	0xC261, 0xC361, 0xC461, 0xC861, 0xCA61,
+	0xF1,   0xCB63, 0xC165, 0xC265, 0xC365,
+	0xC865, 0xC169, 0xC269, 0xC369, 0xC869,
+	0xF3,   0xC46E, 0xC16F, 0xC26F, 0xC36F,
+	0xC46F, 0xC86F, 0xB8,   0xF9,   0xC175,
+	0xC275, 0xC375, 0xC875, 0xC279, 0xFC,
+	0xC879, 0xC541, 0xC561, 0xC641, 0xC661,
+	0xCE41, 0xCE61, 0xC243, 0xC263, 0xC343,
+	0xC363, 0xC743, 0xC763, 0xCF43, 0xCF63,
+	0xCF44, 0xCF64, 0x00,   0xF2,   0xC545,
+	0xC565, 0x00,   0x00,   0xC745, 0xC765,
+	0xCE45, 0xCE65, 0xCF45, 0xCF65, 0xC347,
+	0xC367, 0xC647, 0xC667, 0xC747, 0xC767,
+	0xCB47, 0xCB67, 0xC348, 0xC368, 0xE4,
+	0xF4,   0xC449, 0xC469, 0xC549, 0xC569,
+	0x00,   0x00,   0xCE49, 0xCE69, 0xC749,
+	0xF5,   0xE6,   0xF6,   0xC34A, 0xC36A,
+	0xCB4B, 0xCB6B, 0xF0,   0xC24C, 0xC26C,
+	0xCB4C, 0xCB6C, 0xCF4C, 0xCF6C, 0xE7,
+	0xF7,   0xE8,   0xF8,   0xC24E, 0xC26E,
+	0xCB4E, 0xCB6E, 0xCF4E, 0xCF6E, 0xEF,
+	0xEE,   0xFE,   0xC54F, 0xC56F, 0x00,
+	0x00,   0xCD4F, 0xCD6F, 0xEA,   0xFA,
+	0xC252, 0xC272, 0xCB52, 0xCB72, 0xCF52,
+	0xCF72, 0xC253, 0xC273, 0xC353, 0xC373,
+	0xCB53, 0xCB73, 0xCF53, 0xCF73, 0xCB54,
+	0xCB74, 0xCF54, 0xCF74, 0xED,   0xFD,
+	0xC455, 0xC475, 0xC555, 0xC575, 0xC655,
+	0xC675, 0xCA55, 0xCA75, 0xCD55, 0xCD75,
+	0xCE55, 0xCE75, 0xC357, 0xC377, 0xC359,
+	0xC379, 0xC859, 0xC25A, 0xC27A, 0xC75A,
+	0xC77A, 0xCF5A, 0xCF7A
+};
+
+/// A single entry of the sparse unicode -> ISO-6937-2 mapping table
+struct extended_range {
+	const int codepoint; ///< Unicode codepoint being mapped
+	const int value;     ///< ISO-6937-2 value the codepoint maps to
+};
+
+// Orderings by codepoint. The mixed int/extended_range overloads allow a bare
+// codepoint to be compared against table entries, which is what the
+// std::lower_bound() call in get_iso6937() does.
+
+/// Entry vs. entry
+bool operator<(const extended_range &lhs, const extended_range &rhs) {
+	return lhs.codepoint < rhs.codepoint;
+}
+
+/// Bare codepoint vs. entry
+bool operator<(int lhs, const extended_range &rhs) {
+	return lhs < rhs.codepoint;
+}
+
+/// Entry vs. bare codepoint
+bool operator<(const extended_range &lhs, int rhs) {
+	return lhs.codepoint < rhs;
+}
+
+// ISO-6937-2 values for codepoints that don't come in a nice contiguous block
+//
+// Each entry is { unicode codepoint, ISO-6937-2 value }; values above 0xFF
+// are written out as two bytes, high byte first, by Converter6937::Convert().
+// The entries MUST stay sorted by ascending codepoint, as get_iso6937()
+// searches this table with std::lower_bound().
+const extended_range iso6937_extended_codepoints[] = {
+	{ 0x02C7, 0xCF20 },
+	{ 0x02D8, 0xC620 },
+	{ 0x02D9, 0xC720 },
+	{ 0x02DA, 0xCA20 },
+	{ 0x02DB, 0xCE20 },
+	{ 0x02DD, 0xCD20 },
+	{ 0x2014, 0xD0 },
+	{ 0x2018, 0xA9 },
+	{ 0x2019, 0xB9 },
+	{ 0x201C, 0xAA },
+	{ 0x201D, 0xBA },
+	{ 0x2022, 0xD4 },
+	{ 0x20AC, 0xA4 }, // ETSI EN 300 468 extension: euro sign at A4
+	{ 0x2126, 0xE0 },
+	{ 0x215B, 0xDC },
+	{ 0x215C, 0xDD },
+	{ 0x215D, 0xDE },
+	{ 0x2190, 0xAC },
+	{ 0x2191, 0xAD },
+	{ 0x2192, 0xAE },
+	{ 0x2193, 0xAF },
+	{ 0x266A, 0xD5 }
+};
+
+#define countof(array) (sizeof(array) / sizeof((array)[0]))
+
+/// @brief Look up the ISO-6937-2 encoding of a unicode codepoint
+/// @param codepoint Unicode codepoint to map
+/// @return The ISO-6937-2 value for the codepoint, or 0 if it cannot be mapped
+int get_iso6937(int codepoint) {
+	// Casting to size_t turns a negative codepoint into a huge value, so this
+	// single comparison guards both ends of the dense table
+	const size_t index = static_cast<size_t>(codepoint);
+	if (index < countof(iso6937_codepoints))
+		return iso6937_codepoints[index];
+
+	// Anything past the dense table has to be searched for in the sparse one
+	const extended_range *const first = iso6937_extended_codepoints;
+	const extended_range *const last = first + countof(iso6937_extended_codepoints);
+	const extended_range *const match = std::lower_bound(first, last, codepoint);
+
+	// lower_bound only finds the insertion point, so check for an exact hit
+	const bool found = match != last && match->codepoint == codepoint;
+	return found ? match->value : 0;
+}
+
+} // namespace {
+
+namespace agi { namespace charset {
+
+#ifdef _LIBICONV_VERSION
+#define INTERNAL_CHARSET "UCS-4-INTERNAL"
+#else
+#define INTERNAL_CHARSET "WCHAR_T"
+#endif
+
+/// @brief Constructor
+/// @param subst Replace characters which have no ISO-6937-2 mapping with '?'
+///              rather than failing with EILSEQ
+/// @param src   Name of the encoding to convert from
+///
+/// The input is first decoded to INTERNAL_CHARSET by a regular IconvWrapper
+/// so that Convert() only has to deal with unicode codepoints.
+Converter6937::Converter6937(bool subst, const char *src)
+	: to_ucs4(new IconvWrapper(src, INTERNAL_CHARSET)),
+	  subst(subst)
+{
+}
+
+/// @brief Convert text from the source encoding to ISO-6937-2
+/// @param inbuf        Pointer to the input cursor; advanced past each fully converted character
+/// @param inbytesleft  Bytes remaining at *inbuf; decreased in step with inbuf
+/// @param outbuf       Pointer to the output cursor; advanced past each byte written
+/// @param outbytesleft Space remaining at *outbuf; decreased in step with outbuf
+/// @return Number of bytes written to the output, or (size_t)-1 on failure with
+///         errno set (EILSEQ for an unmappable character when substitution is
+///         disabled, E2BIG when the output is full, or whatever the source
+///         decoder reported)
+size_t Converter6937::Convert(const char **inbuf, size_t *inbytesleft, char **outbuf, size_t *outbytesleft) {
+	// A call without input is a state reset request, and there is no state
+	if (!inbuf || !inbytesleft)
+		return 0;
+
+	size_t bytes_written = 0;
+
+	while (*inbytesleft > 0) {
+		// Decode from a copy of the input cursor so that the caller's cursor
+		// is left untouched if this character ends up failing (bad sequence
+		// or not enough output space)
+		const char *next_in = *inbuf;
+		size_t next_in_left = *inbytesleft;
+
+		// The decoder's output buffer is a single int, i.e. one codepoint
+		int codepoint = 0;
+		char *cp_out = reinterpret_cast<char *>(&codepoint);
+		size_t cp_out_left = sizeof(codepoint);
+
+		const size_t ret = to_ucs4->Convert(&next_in, &next_in_left, &cp_out, &cp_out_left);
+		// E2BIG is ignored here: the decoder was only given room for one int,
+		// so running out of space does not by itself mean the input is bad
+		if (ret == (size_t)-1 && errno != E2BIG)
+			return ret;
+
+		// Map the codepoint to ISO-6937-2; 0 for a non-NUL input means "unmappable"
+		int val = get_iso6937(codepoint);
+		if (val == 0 && codepoint != 0) {
+			if (!subst) {
+				errno = EILSEQ;
+				return (size_t)-1;
+			}
+			val = '?';
+		}
+
+		// Values above 255 are two-byte sequences, everything else is one byte
+		const size_t needed = val > 255 ? 2 : 1;
+		if (*outbytesleft < needed) {
+			errno = E2BIG;
+			return (size_t)-1;
+		}
+
+		char *&out = *outbuf;
+		if (val > 255) {
+			// High byte first, then low byte
+			*out++ = (val >> 8) & 0xFF;
+			*out++ = val & 0xFF;
+		}
+		else {
+			*out++ = val;
+		}
+		*outbytesleft -= needed;
+		bytes_written += needed;
+
+		// The character is fully written, so commit the input position
+		*inbuf = next_in;
+		*inbytesleft = next_in_left;
+	}
+
+	return bytes_written;
+}
+
+} } // namespace agi::charset
--- a/aegisub/libaegisub/common/charset_6937.h
+++ b/aegisub/libaegisub/common/charset_6937.h
@ -0,0 +1,46 @@
+// Copyright (c) 2012, Thomas Goyne <plorkyeran@aegisub.org>
+//
+// Permission to use, copy, modify, and distribute this software for any
+// purpose with or without fee is hereby granted, provided that the above
+// copyright notice and this permission notice appear in all copies.
+//
+// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+// ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+// ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+// OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+//
+// $Id$
+
+/// @file charset_6937.h
+/// @brief A charset converter for ISO-6937-2
+/// @ingroup libaegisub
+
+#include <libaegisub/charset_conv.h>
+
+namespace agi { namespace charset {
+
+/// @brief A charset converter for ISO-6937-2
+///
+/// glibc's iconv supports ISO-6937-2, but GNU libiconv does not, as the
+/// encoding is no longer used by anything but old subtitle formats. This
+/// class only converts *to* ISO-6937-2; the source encoding is chosen at
+/// construction time.
+class Converter6937 : public Converter {
+	/// Converter to UCS-4 so that we only have to deal with unicode codepoints
+	agi::scoped_ptr<IconvWrapper> to_ucs4;
+
+	/// Should unsupported characters be replaced with '?'
+	const bool subst;
+
+public:
+	/// Constructor
+	/// @param subst Enable substitution for unsupported characters
+	/// @param src Source encoding
+	Converter6937(bool subst, const char *src);
+
+	/// Convert a string. Parameters follow the iconv() calling convention.
+	/// @return Number of bytes written to the output buffer, or (size_t)-1
+	///         with errno set on failure
+	size_t Convert(const char** inbuf, size_t* inbytesleft, char** outbuf, size_t* outbytesleft);
+};
+
+} }
--- a/aegisub/libaegisub/common/charset_conv.cpp
+++ b/aegisub/libaegisub/common/charset_conv.cpp
@ -31,6 +31,8 @@
 #include <libaegisub/charset_conv.h>
 #include <iconv.h>

+#include "charset_6937.h"
+
 // Check if we can use advanced fallback capabilities added in GNU's iconv
 // implementation
 #if !defined(_LIBICONV_VERSION) || _LIBICONV_VERSION < 0x010A || defined(LIBICONV_PLUG)
@ -52,244 +54,251 @@ namespace {
 			return strcmp(s1, s2) < 0;
 		}
 	};
-}
+
+	agi::charset::Converter *get_converter(bool subst, const char *src, const char *dst);

 /// @brief Map a user-friendly encoding name to the real encoding name
-static const char* GetRealEncodingName(const char* name) {
-	static std::map<const char*, const char*, ltstr> prettyNames;
+	const char* get_real_encoding_name(const char* name) {
+		static std::map<const char*, const char*, ltstr> pretty_names;

-	if (prettyNames.empty()) {
-#		define ADD(pretty, real) prettyNames[pretty] = real
-#		include <libaegisub/charsets.def>
-#		undef ADD
-	}
-
-	std::map<const char*, const char*, ltstr>::iterator real = prettyNames.find(name);
-	if (real != prettyNames.end()) {
-		return real->second;
-	}
-	return name;
-}
-
-
-namespace agi {
-	namespace charset {
-
-static size_t get_bom_size(iconv_t cd) {
-	// Most (but not all) iconv implementations automatically insert a BOM
-	// at the beginning of text converted to UTF-8, UTF-16 and UTF-32, but
-	// we usually don't want this, as some of the wxString using code
-	// assumes there is no BOM (as the exact encoding is known externally)
-	// As such, when doing conversions we will strip the BOM if it exists,
-	// then manually add it when writing files
-
-	char buff[8];
-	const char* src = "";
-	char *dst = buff;
-	size_t srcLen = 1;
-	size_t dstLen = 8;
-
-	size_t res = iconv(cd, ICONV_CONST_CAST(&src), &srcLen, &dst, &dstLen);
-	assert(res != iconv_failed);
-	assert(srcLen == 0);
-
-	size_t size = 0;
-	for (src = buff; src < dst; ++src) {
-		if (*src) ++size;
-	}
-	if (size) {
-		// If there is a BOM, it will always be at least as big as the NUL
-		size = std::max(size, (8 - dstLen) / 2);
-	}
-	return size;
-}
-
-static void eat_bom(iconv_t cd, size_t bomSize, const char** inbuf, size_t* inbytesleft, char** outbuf, size_t* outbytesleft) {
-	// If this encoding has a forced BOM (i.e. it's UTF-16 or UTF-32 without
-	// a specified byte order), skip over it
-	if (bomSize > 0 && inbytesleft && *inbytesleft) {
-		// libiconv marks the bom as written after writing the first
-		// character after the bom rather than when it writes the bom, so
-		// convert at least one extra character
-		char bom[8];
-		char *dst = bom;
-		size_t dstSize = std::min((size_t)8, bomSize + *outbytesleft);
-		const char *src = *inbuf;
-		size_t srcSize = *inbytesleft;
-		iconv(cd, ICONV_CONST_CAST(&src), &srcSize, &dst, &dstSize);
-	}
-}
-
-#ifdef ICONV_POSIX
-class Converter {
-	size_t bomSize;
-	iconv_t cd;
-public:
-	// subst is not used here because POSIX doesn't let you disable substitution
-	Converter(bool, const char* sourceEncoding, const char* destEncoding)
-	{
-		const char *dstEnc = GetRealEncodingName(destEncoding);
-		cd = iconv_open(dstEnc, "UTF-8");
-		if (cd == iconv_invalid) {
-			throw UnsupportedConversion(std::string(dstEnc) + " is not a supported character set");
+		if (pretty_names.empty()) {
+#			define ADD(pretty, real) pretty_names[pretty] = real
+#			include <libaegisub/charsets.def>
+#			undef ADD
 		}

-		bomSize = get_bom_size(cd);
-		iconv_close(cd);
-		cd = iconv_open(dstEnc, GetRealEncodingName(sourceEncoding));
-		if (cd == iconv_invalid) {
-			throw UnsupportedConversion(std::string("Cannot convert from ") + sourceEncoding + " to " + destEncoding);
-		}
+		std::map<const char*, const char*, ltstr>::iterator real = pretty_names.find(name);
+		if (real != pretty_names.end())
+			return real->second;
+		return name;
 	}
-	~Converter() {
-		if (cd != iconv_invalid) iconv_close(cd);
-	}
-	size_t Convert(const char** inbuf, size_t* inbytesleft, char** outbuf, size_t* outbytesleft) {
-		eat_bom(cd, bomSize, inbuf, inbytesleft, outbuf, outbytesleft);

-		size_t res = iconv(cd, ICONV_CONST_CAST(inbuf), inbytesleft, outbuf, outbytesleft);
+	size_t get_bom_size(iconv_t cd) {
+		// Most (but not all) iconv implementations automatically insert a BOM
+		// at the beginning of text converted to UTF-8, UTF-16 and UTF-32, but
+		// we usually don't want this, as some of the wxString using code
+		// assumes there is no BOM (as the exact encoding is known externally)
+		// As such, when doing conversions we will strip the BOM if it exists,
+		// then manually add it when writing files

-		// This loop never does anything useful with a POSIX-compliant iconv
-		// implementation, but those don't seem to actually exist
-		while (res == iconv_failed && errno != E2BIG) {
-			++*inbuf;
-			--*inbytesleft;
-			res = iconv(cd, ICONV_CONST_CAST(inbuf), inbytesleft, outbuf, outbytesleft);
-		}
-
-		return res;
-	}
-};
-
-#else
-
-class Converter : public iconv_fallbacks {
-	size_t bomSize;
-	char invalidRep[8];
-	size_t invalidRepSize;
-	iconv_t cd;
-	static void fallback(
-		unsigned int code,
-		void (*callback) (const char *buf, size_t buflen, void* callback_arg),
-		void *callback_arg,
-		void *convPtr)
-	{
-		// At some point in the future, this should probably switch to a real mapping
-		// For now, there's just three cases: BOM to nothing, '\' to itself
-		// (for Shift-JIS, which does not have \) and everything else to '?'
-		if (code == 0xFEFF) return;
-		if (code == 0x5C) callback("\\", 1, callback_arg);
-		else {
-			Converter *self = static_cast<Converter *>(convPtr);
-			callback(self->invalidRep, self->invalidRepSize, callback_arg);
-		}
-	}
-	Converter(Converter const&);
-	Converter& operator=(Converter const&);
-public:
-	Converter(bool subst, const char* sourceEncoding, const char* destEncoding)
-	{
-
-		const char *dstEnc = GetRealEncodingName(destEncoding);
-		cd = iconv_open(dstEnc, "UTF-8");
-		if (cd == iconv_invalid) {
-			throw UnsupportedConversion(std::string(dstEnc) + " is not a supported character set");
-		}
-
-		bomSize = get_bom_size(cd);
-
-		// Get fallback character
-		const char sbuff[] = "?";
-		const char *src = sbuff;
-		char *dst = invalidRep;
-		size_t dstLen = 4;
+		char buff[8];
+		const char* src = "";
+		char *dst = buff;
 		size_t srcLen = 1;
+		size_t dstLen = 8;

-		size_t res = Convert(&src, &srcLen, &dst, &dstLen);
+		size_t res = iconv(cd, ICONV_CONST_CAST(&src), &srcLen, &dst, &dstLen);
 		assert(res != iconv_failed);
 		assert(srcLen == 0);

-		invalidRepSize = 4 - dstLen;
-
-		iconv_close(cd);
-		cd = iconv_open(dstEnc, GetRealEncodingName(sourceEncoding));
-		if (cd == iconv_invalid) {
-			throw UnsupportedConversion(std::string("Cannot convert from ") + sourceEncoding + " to " + destEncoding);
+		size_t size = 0;
+		for (src = buff; src < dst; ++src) {
+			if (*src) ++size;
 		}
+		if (size) {
+			// If there is a BOM, it will always be at least as big as the NUL
+			size = std::max(size, (8 - dstLen) / 2);
+		}
+		return size;
+	}

-		if (subst) {
-			data = this;
-			mb_to_uc_fallback = NULL;
-			mb_to_wc_fallback = NULL;
-			uc_to_mb_fallback = fallback;
-			wc_to_mb_fallback = NULL;
-
-			int transliterate = 1;
-			iconvctl(cd, ICONV_SET_TRANSLITERATE, &transliterate);
-			iconvctl(cd, ICONV_SET_FALLBACKS, this);
+	void eat_bom(iconv_t cd, size_t bomSize, const char** inbuf, size_t* inbytesleft, char** outbuf, size_t* outbytesleft) {
+		// If this encoding has a forced BOM (i.e. it's UTF-16 or UTF-32 without
+		// a specified byte order), skip over it
+		if (bomSize > 0 && inbytesleft && *inbytesleft) {
+			// libiconv marks the bom as written after writing the first
+			// character after the bom rather than when it writes the bom, so
+			// convert at least one extra character
+			char bom[8];
+			char *dst = bom;
+			size_t dstSize = std::min((size_t)8, bomSize + *outbytesleft);
+			const char *src = *inbuf;
+			size_t srcSize = *inbytesleft;
+			iconv(cd, ICONV_CONST_CAST(&src), &srcSize, &dst, &dstSize);
 		}
 	}
-	~Converter() {
-		if (cd != iconv_invalid) iconv_close(cd);
+
+	/// @brief Calculate the size of NUL in the given character set
+	/// @param encoding Name of the character set to measure
+	/// @return Number of bytes produced by converting a single NUL to encoding
+	size_t nul_size(const char* encoding) {
+		// The source charset needs a known encoding of NUL to convert from,
+		// and UTF-8 is the obvious choice for that
+		agi::scoped_ptr<agi::charset::Converter> conv(get_converter(false, "UTF-8", encoding));
+
+		// Input: just the terminating NUL of an empty string
+		char nul[] = "";
+		const char* in = nul;
+		size_t in_left = 1;
+
+		// Output: scratch space for the converted NUL
+		char out_buf[4];
+		char* out = out_buf;
+		size_t out_left = sizeof(out_buf);
+
+		size_t ret = conv->Convert(&in, &in_left, &out, &out_left);
+		assert(ret != iconv_failed);
+		assert(out - out_buf > 0);
+
+		// However far the output cursor moved is the size of NUL
+		return out - out_buf;
+	}
-	size_t Convert(const char** inbuf, size_t* inbytesleft, char** outbuf, size_t* outbytesleft) {
-		eat_bom(cd, bomSize, inbuf, inbytesleft, outbuf, outbytesleft);
-		size_t res = iconv(cd, ICONV_CONST_CAST(inbuf), inbytesleft, outbuf, outbytesleft);

-		if (res == iconv_failed && errno == E2BIG && *outbytesleft == 0) {
-			// libiconv checks if there are any bytes left in the output buffer
-			// before checking if the conversion would actually write any
-			// characters to the output buffer, resulting in occasional invalid
-			// E2BIG false positives
-			char buff[8];
-			size_t buffsize = 8;
-			char* out = buff;
-			const char* in = *inbuf;
-			size_t insize = *inbytesleft;
+#ifdef ICONV_POSIX
+	class ConverterImpl : public agi::charset::Converter {
+		size_t bomSize;
+		iconv_t cd;
+	public:
+		// subst is not used here because POSIX doesn't let you disable substitution
+		ConverterImpl(bool, const char* sourceEncoding, const char* destEncoding)
+		{
+			const char *dstEnc = get_real_encoding_name(destEncoding);
+			cd = iconv_open(dstEnc, "UTF-8");
+			if (cd == iconv_invalid) {
+				throw agi::charset::UnsupportedConversion(std::string(dstEnc) + " is not a supported character set");
+			}

-			res = iconv(cd, ICONV_CONST_CAST(&in), &insize, &out, &buffsize);
-			// If no bytes of the output buffer were used, the original
-			// conversion may have been successful
-			if (buffsize != 8) {
-				errno = E2BIG;
-				res = iconv_failed;
+			bomSize = get_bom_size(cd);
+			iconv_close(cd);
+			cd = iconv_open(dstEnc, get_real_encoding_name(sourceEncoding));
+			if (cd == iconv_invalid) {
+				throw agi::charset::UnsupportedConversion(std::string("Cannot convert from ") + sourceEncoding + " to " + destEncoding);
 			}
 		}
+		~ConverterImpl() {
+			if (cd != iconv_invalid) iconv_close(cd);
+		}
+		size_t Convert(const char** inbuf, size_t* inbytesleft, char** outbuf, size_t* outbytesleft) {
+			eat_bom(cd, bomSize, inbuf, inbytesleft, outbuf, outbytesleft);

-		return res;
-	}
-};
+			size_t res = iconv(cd, ICONV_CONST_CAST(inbuf), inbytesleft, outbuf, outbytesleft);
+
+			// This loop never does anything useful with a POSIX-compliant iconv
+			// implementation, but those don't seem to actually exist
+			while (res == iconv_failed && errno != E2BIG) {
+				++*inbuf;
+				--*inbytesleft;
+				res = iconv(cd, ICONV_CONST_CAST(inbuf), inbytesleft, outbuf, outbytesleft);
+			}
+
+			return res;
+		}
+	};
+
+#else
+
+	/// Converter for iconv implementations which support fallbacks (selected
+	/// when ICONV_POSIX is not defined). Derives from iconv_fallbacks so that
+	/// the object itself can be handed to iconvctl(ICONV_SET_FALLBACKS).
+	class ConverterImpl : public iconv_fallbacks, public agi::charset::Converter {
+		/// BOM size for the destination charset, as measured by get_bom_size()
+		size_t bomSize;
+		/// '?' encoded in the destination charset; used as the replacement text
+		char invalidRep[8];
+		/// Number of bytes of invalidRep which are in use
+		size_t invalidRepSize;
+		/// The iconv conversion descriptor
+		iconv_t cd;
+		/// unicode -> multibyte fallback, installed when substitution is enabled
+		/// @param code         Codepoint handed to the fallback
+		/// @param callback     Function to call with the replacement bytes
+		/// @param callback_arg Opaque argument to pass through to callback
+		/// @param convPtr      The ConverterImpl which installed the fallback
+		static void fallback(
+			unsigned int code,
+			void (*callback) (const char *buf, size_t buflen, void* callback_arg),
+			void *callback_arg,
+			void *convPtr)
+		{
+			// At some point in the future, this should probably switch to a real mapping
+			// For now, there's just three cases: BOM to nothing, '\' to itself
+			// (for Shift-JIS, which does not have \) and everything else to '?'
+			if (code == 0xFEFF) return;
+			if (code == 0x5C) callback("\\", 1, callback_arg);
+			else {
+				ConverterImpl *self = static_cast<ConverterImpl *>(convPtr);
+				callback(self->invalidRep, self->invalidRepSize, callback_arg);
+			}
+		}
+		// Copying is declared private to forbid it; no definitions are visible here
+		ConverterImpl(ConverterImpl const&);
+		ConverterImpl& operator=(ConverterImpl const&);
+	public:
+		/// @param subst          Install the fallback handler and enable transliteration
+		/// @param sourceEncoding Encoding to convert from
+		/// @param destEncoding   Encoding to convert to
+		/// @throws agi::charset::UnsupportedConversion if either iconv_open() call fails
+		ConverterImpl(bool subst, const char* sourceEncoding, const char* destEncoding)
+		{
+			// First open a UTF-8 -> destination descriptor, which is used only
+			// to measure the BOM size and to encode the replacement character
+			const char *dstEnc = get_real_encoding_name(destEncoding);
+			cd = iconv_open(dstEnc, "UTF-8");
+			if (cd == iconv_invalid)
+				throw agi::charset::UnsupportedConversion(std::string(dstEnc) + " is not a supported character set");
+
+			bomSize = get_bom_size(cd);
+
+			// Get fallback character
+			const char sbuff[] = "?";
+			const char *src = sbuff;
+			char *dst = invalidRep;
+			size_t dstLen = 4;
+			size_t srcLen = 1;
+
+			size_t res = Convert(&src, &srcLen, &dst, &dstLen);
+			assert(res != iconv_failed);
+			assert(srcLen == 0);
+
+			invalidRepSize = 4 - dstLen;
+
+			// Now replace the temporary descriptor with the real
+			// source -> destination one
+			iconv_close(cd);
+			cd = iconv_open(dstEnc, get_real_encoding_name(sourceEncoding));
+			if (cd == iconv_invalid)
+				throw agi::charset::UnsupportedConversion(std::string("Cannot convert from ") + sourceEncoding + " to " + destEncoding);
+
+			if (subst) {
+				// Fill in the inherited iconv_fallbacks fields; only the
+				// unicode -> multibyte direction has a handler
+				data = this;
+				mb_to_uc_fallback = NULL;
+				mb_to_wc_fallback = NULL;
+				uc_to_mb_fallback = fallback;
+				wc_to_mb_fallback = NULL;
+
+				int transliterate = 1;
+				iconvctl(cd, ICONV_SET_TRANSLITERATE, &transliterate);
+				iconvctl(cd, ICONV_SET_FALLBACKS, this);
+			}
+		}
+		~ConverterImpl() {
+			if (cd != iconv_invalid) iconv_close(cd);
+		}
+		/// Convert a chunk of text; parameters and return value are those of iconv()
+		size_t Convert(const char** inbuf, size_t* inbytesleft, char** outbuf, size_t* outbytesleft) {
+			eat_bom(cd, bomSize, inbuf, inbytesleft, outbuf, outbytesleft);
+			size_t res = iconv(cd, ICONV_CONST_CAST(inbuf), inbytesleft, outbuf, outbytesleft);
+
+			if (res == iconv_failed && errno == E2BIG && *outbytesleft == 0) {
+				// libiconv checks if there are any bytes left in the output buffer
+				// before checking if the conversion would actually write any
+				// characters to the output buffer, resulting in occasional invalid
+				// E2BIG false positives
+				char buff[8];
+				size_t buffsize = 8;
+				char* out = buff;
+				// Retry on copies of the input cursor into a scratch buffer,
+				// so the caller's buffers are not modified by the probe
+				const char* in = *inbuf;
+				size_t insize = *inbytesleft;
+
+				res = iconv(cd, ICONV_CONST_CAST(&in), &insize, &out, &buffsize);
+				// If no bytes of the output buffer were used, the original
+				// conversion may have been successful
+				if (buffsize != 8) {
+					errno = E2BIG;
+					res = iconv_failed;
+				}
+			}
+
+			return res;
+		}
+	};
 #endif

-// Calculate the size of NUL in the given character set
-static size_t NulSize(const char* encoding) {
-	// We need a character set to convert from with a known encoding of NUL
-	// UTF-8 seems like the obvious choice
-	Converter cd(false, "UTF-8", encoding);
+	/// @brief Create a converter for the given pair of encodings
+	/// @param subst Enable substitution of unconvertible characters
+	/// @param src   Encoding to convert from
+	/// @param dst   Encoding to convert to
+	/// @return A newly allocated converter, which the caller owns
+	///
+	/// The iconv-backed converter is always tried first. If it reports the
+	/// conversion as unsupported and dst is exactly "ISO-6937-2", the built-in
+	/// ISO-6937-2 converter is used instead; for any other dst the
+	/// UnsupportedConversion exception is rethrown.
+	agi::charset::Converter *get_converter(bool subst, const char *src, const char *dst) {
+		try {
+			return new ConverterImpl(subst, src, dst);
+		}
+		catch (const agi::charset::UnsupportedConversion &) {
+			if (strcmp(dst, "ISO-6937-2") == 0)
+				return new agi::charset::Converter6937(subst, src);
+			throw;
+		}
+	}
+} // namespace {

-	char dbuff[4];
-	char sbuff[] = "";
-	char* dst = dbuff;
-	const char* src = sbuff;
-	size_t dstLen = sizeof(dbuff);
-	size_t srcLen = 1;
-
-	size_t ret = cd.Convert(&src, &srcLen, &dst, &dstLen);
-	assert(ret != iconv_failed);
-	assert(dst - dbuff > 0);
-
-	return dst - dbuff;
-}
+namespace agi { namespace charset {

 IconvWrapper::IconvWrapper(const char* sourceEncoding, const char* destEncoding, bool enableSubst)
 : toNulLen(0)
 , fromNulLen(0)
-, conv(new Converter(enableSubst, sourceEncoding, destEncoding))
+, conv(get_converter(enableSubst, sourceEncoding, destEncoding))
 {
 	// These need to be set only after we verify that the source and dest
 	// charsets are valid
-	toNulLen = NulSize(destEncoding);
-	fromNulLen = NulSize(sourceEncoding);
+	toNulLen = nul_size(destEncoding);
+	fromNulLen = nul_size(sourceEncoding);
 }
 IconvWrapper::~IconvWrapper() {
 }
--- a/aegisub/libaegisub/include/libaegisub/charset_conv.h
+++ b/aegisub/libaegisub/include/libaegisub/charset_conv.h
@ -41,9 +41,12 @@ DEFINE_SIMPLE_EXCEPTION_NOINNER(BadOutput, ConversionFailure, "iconv/failed/EINV

 typedef void* iconv_t;

-// Helper class that abstracts away the differences between libiconv and
-// POSIX iconv implementations
-class Converter;
+/// Abstract interface for the converter used internally by IconvWrapper.
+/// Implementations abstract away the differences between libiconv and
+/// POSIX iconv implementations, or perform a conversion themselves.
+struct Converter {
+	/// Virtual so that implementations can be destroyed through a Converter pointer
+	virtual ~Converter() { }
+	/// Convert a chunk of text; parameters follow the iconv() calling convention
+	virtual size_t Convert(const char** inbuf, size_t* inbytesleft, char** outbuf, size_t* outbytesleft) = 0;
+};

 /// @brief A C++ wrapper for iconv
 class IconvWrapper {
--- a/aegisub/tests/libaegisub_iconv.cpp
+++ b/aegisub/tests/libaegisub_iconv.cpp
@ -150,3 +150,36 @@ TEST(lagi_iconv, Roundtrip) {
 					"Jackdaws love my big sphinx of quartz")));
 	}
 }
+
+// Conversion from UTF-8 to ISO-6937-2, which is either handled by iconv
+// itself or by the built-in converter when iconv does not support it
+TEST(lagi_iconv, Iso6937) {
+	ASSERT_NO_THROW(IconvWrapper("UTF-8", "ISO-6937-2"));
+	IconvWrapper subst("UTF-8", "ISO-6937-2");
+	IconvWrapper no_subst("UTF-8", "ISO-6937-2", false);
+
+	// 7-bit is same as ISO-8859
+	for (int i = 0; i < 128; ++i) {
+		// The explicit cast is required: int -> char inside a braced
+		// initializer is a narrowing conversion, which is ill-formed in C++11
+		const char buf[] = { static_cast<char>(i), 0 };
+		std::string ret;
+		EXPECT_NO_THROW(ret = subst.Convert(buf));
+		EXPECT_STREQ(buf, ret.c_str());
+	}
+
+	std::string ret;
+
+	// LATIN CAPITAL LETTER D WITH CARON (U+010E) - multibyte char in main block
+	EXPECT_NO_THROW(ret = subst.Convert("\xC4\x8E"));
+	EXPECT_STREQ("\xCF\x44", ret.c_str());
+
+	// BREVE - multibyte char in extended ranges
+	EXPECT_NO_THROW(ret = subst.Convert("\xCB\x98"));
+	EXPECT_STREQ("\xC6\x20", ret.c_str());
+
+	// EM DASH - single byte char in extended ranges
+	EXPECT_NO_THROW(ret = subst.Convert("\xE2\x80\x94"));
+	EXPECT_STREQ("\xD0", ret.c_str());
+
+	// codepoint not in ISO-6937-2: replaced with '?' when substitution is
+	// enabled, and an error when it is not
+	EXPECT_NO_THROW(ret = subst.Convert("\xCB\x97"));
+	EXPECT_STREQ("?", ret.c_str());
+	EXPECT_THROW(no_subst.Convert("\xCB\x97"), agi::charset::BadOutput);
+}