From 10a88dfb52f1e62a02f98d3936335a75f3b7a20b Mon Sep 17 00:00:00 2001 From: Thomas Goyne Date: Fri, 1 Feb 2013 09:29:34 -0800 Subject: [PATCH] Use boost.locale for spellchecker word splitting This is currently only correct when the UI language is the same as the subtitle language, because the global locale is used, but it should still never be worse than the small hardcoded table of word separators. Closes #1206. --- aegisub/libaegisub/ass/dialogue_parser.cpp | 72 +++++----------------- aegisub/tests/tests/word_split.cpp | 24 +++++++- 2 files changed, 37 insertions(+), 59 deletions(-) diff --git a/aegisub/libaegisub/ass/dialogue_parser.cpp b/aegisub/libaegisub/ass/dialogue_parser.cpp index 1babd713e..921518c88 100644 --- a/aegisub/libaegisub/ass/dialogue_parser.cpp +++ b/aegisub/libaegisub/ass/dialogue_parser.cpp @@ -18,11 +18,12 @@ #include "libaegisub/ass/dialogue_parser.h" -#include "libaegisub/scoped_ptr.h" #include "libaegisub/spellchecker.h" #include "iconv.h" +#include + namespace { typedef std::vector TokenVec; @@ -94,70 +95,28 @@ public: class WordSplitter { std::string const& text; std::vector &tokens; - agi::scoped_holder utf8_to_utf32; size_t pos; - bool IsWordSep(int chr) { - static const int delims[] = { - 0x0020, 0x0021, 0x0022, 0x0023, 0x0024, 0x0025, 0x0026, 0x0028, - 0x0029, 0x002a, 0x002b, 0x002c, 0x002d, 0x002e, 0x002f, 0x003a, - 0x003b, 0x003d, 0x003f, 0x0040, 0x005b, 0x005c, 0x005d, 0x005e, - 0x005f, 0x0060, 0x007b, 0x007c, 0x007d, 0x007e, 0x00a1, 0x00a2, - 0x00a3, 0x00a4, 0x00a5, 0x00a6, 0x00a7, 0x00a8, 0x00aa, 0x00ab, - 0x00b0, 0x00b6, 0x00b7, 0x00ba, 0x00bb, 0x00bf, 0x02dc, 0x0e3f, - 0x2010, 0x2013, 0x2014, 0x2015, 0x2018, 0x2019, 0x201c, 0x201d, - 0x2020, 0x2021, 0x2022, 0x2025, 0x2026, 0x2026, 0x2030, 0x2031, - 0x2032, 0x203b, 0x203b, 0x203d, 0x2042, 0x2044, 0x20a6, 0x20a9, - 0x20aa, 0x20ac, 0x20ad, 0x2116, 0x2234, 0x2235, 0x2420, 0x2422, - 0x2423, 0x2506, 0x25ca, 0x2605, 0x261e, 0x2e2e, 0x3000, 0x3001, - 0x3002, 0x3008, 0x3009, 0x300a, 0x300b, 
0x300c, 0x300d, 0x300e, - 0x300f, 0x3010, 0x3011, 0x3014, 0x3015, 0x3016, 0x3017, 0x3018, - 0x3019, 0x301a, 0x301b, 0x301c, 0x3030, 0x303d, 0x30fb, 0xff0a, - 0xff5b, 0xff5d, 0xff5e - }; - - return std::binary_search(std::begin(delims), std::end(delims), chr); - } - - int NextChar(int pos, int len, int& char_len) { - int chr = 0; - char *inptr = const_cast(&text[pos]); - size_t inlen = len; - char *outptr = (char *)&chr; - size_t outlen = sizeof chr; - - iconv(utf8_to_utf32, &inptr, &inlen, &outptr, &outlen); - if (outlen != 0) - return 0; - - char_len = len - inlen; - return chr; - } - void SwitchTo(size_t &i, int type, int len) { - if (tokens[i].type == type) return; + auto old = tokens[i]; + tokens[i].type = type; + tokens[i].length = len; - if (tokens[i].length == (size_t)len) - tokens[i].type = type; - else { - tokens.insert(tokens.begin() + i + 1, DialogueToken(type, len)); - tokens[i].length -= len; + if (old.length != (size_t)len) { + tokens.insert(tokens.begin() + i + 1, DialogueToken(old.type, old.length - len)); ++i; } } void SplitText(size_t &i) { - int chrlen = 0; - int len = tokens[i].length; - int tpos = pos; - for (; len > 0; tpos += chrlen, len -= chrlen) { - int chr = NextChar(tpos, len, chrlen); - if (!chr) return; - - if (IsWordSep(chr)) - SwitchTo(i, dt::TEXT, len); - else + using namespace boost::locale::boundary; + ssegment_index map(word, text.begin() + pos, text.begin() + pos + tokens[i].length); + for (auto const& segment : map) { + int len = distance(begin(segment), end(segment)); + if (segment.rule() & word_letters) SwitchTo(i, dt::WORD, len); + else + SwitchTo(i, dt::TEXT, len); } } @@ -165,7 +124,6 @@ public: WordSplitter(std::string const& text, std::vector &tokens) : text(text) , tokens(tokens) - , utf8_to_utf32(iconv_open("utf-32le", "utf-8"), iconv_close) , pos(0) { } @@ -175,7 +133,7 @@ public: for (size_t i = 0; i < tokens.size(); ++i) { size_t len = tokens[i].length; if (tokens[i].type == dt::TEXT) - SplitText(i); + SplitText(i); 
pos += len; } } diff --git a/aegisub/tests/tests/word_split.cpp b/aegisub/tests/tests/word_split.cpp index 3ffb3a3be..4c19de763 100644 --- a/aegisub/tests/tests/word_split.cpp +++ b/aegisub/tests/tests/word_split.cpp @@ -126,10 +126,11 @@ TEST(lagi_word_split, unclosed_ovr) { }; SplitWords(text, tokens); - ASSERT_EQ(3u, tokens.size()); + ASSERT_EQ(4u, tokens.size()); EXPECT_EQ(dt::WORD, tokens[0].type); EXPECT_EQ(dt::TEXT, tokens[1].type); - EXPECT_EQ(dt::WORD, tokens[2].type); + EXPECT_EQ(dt::TEXT, tokens[2].type); + EXPECT_EQ(dt::WORD, tokens[3].type); text = "{"; tokens.clear(); @@ -140,3 +141,22 @@ TEST(lagi_word_split, unclosed_ovr) { EXPECT_EQ(dt::TEXT, tokens[0].type); } +TEST(lagi_word_split, several_words) { + std::string text = "a bb ccc dd e"; + std::vector tokens = { + {dt::TEXT, 13}, + }; + + SplitWords(text, tokens); + ASSERT_EQ(9u, tokens.size()); + EXPECT_EQ(1, tokens[0].length); + EXPECT_EQ(1, tokens[1].length); + EXPECT_EQ(2, tokens[2].length); + EXPECT_EQ(1, tokens[3].length); + EXPECT_EQ(3, tokens[4].length); + EXPECT_EQ(1, tokens[5].length); + EXPECT_EQ(2, tokens[6].length); + EXPECT_EQ(1, tokens[7].length); + EXPECT_EQ(1, tokens[8].length); +} +