// Reviewer summary of the patch below. The patch is a unified diff whose line
// breaks have been collapsed; its text is kept verbatim after these notes.
//
// libaegisub/ass/dialogue_parser.cpp
//   * Removes the include of "libaegisub/scoped_ptr.h" and adds one new
//     #include (its target is not visible in this copy of the patch).
//   * WordSplitter drops the utf8_to_utf32 iconv handle (member and its
//     iconv_open/iconv_close initializer), the IsWordSep() delimiter table
//     and the NextChar() decoder.
//   * SwitchTo(i, type, len) no longer returns early when the type already
//     matches. It now always sets tokens[i] to `type` with length `len`; if
//     the previous length differed from `len`, a token carrying the previous
//     type and the remaining length (old length - len) is inserted at i + 1
//     and `i` is advanced to it.
//   * SplitText(i) now builds a boost::locale::boundary ssegment_index with
//     the `word` boundary type over [pos, pos + tokens[i].length) of `text`
//     and calls SwitchTo() once per segment: dt::WORD when
//     segment.rule() & word_letters is non-zero, dt::TEXT otherwise.
//   * The last hunk of this file only changes whitespace on the SplitText(i)
//     call.
//   NOTE(review): "iconv.h" is still included although every iconv use visible
//     in this patch is removed - confirm whether the rest of the file needs it.
//   NOTE(review): ssegment_index is constructed without an explicit locale;
//     presumably it relies on the global locale providing Boost.Locale
//     boundary analysis - confirm that it is installed before this runs.
//   NOTE(review): only the word_letters rule bits select dt::WORD, so segments
//     flagged solely by other word rules end up as dt::TEXT - confirm this is
//     intended.
//
// tests/tests/word_split.cpp
//   * In the hunk labelled unclosed_ovr the expectation changes from 3 tokens
//     (WORD, TEXT, WORD) to 4 tokens (WORD, TEXT, TEXT, WORD).
//   * Adds test several_words: "a bb ccc dd e" given as a single dt::TEXT
//     token of length 13 must split into 9 tokens with lengths
//     1, 1, 2, 1, 3, 1, 2, 1, 1.
diff --git a/aegisub/libaegisub/ass/dialogue_parser.cpp b/aegisub/libaegisub/ass/dialogue_parser.cpp index 1babd713e..921518c88 100644 --- a/aegisub/libaegisub/ass/dialogue_parser.cpp +++ b/aegisub/libaegisub/ass/dialogue_parser.cpp @@ -18,11 +18,12 @@ #include "libaegisub/ass/dialogue_parser.h" -#include "libaegisub/scoped_ptr.h" #include "libaegisub/spellchecker.h" #include "iconv.h" +#include + namespace { typedef std::vector TokenVec; @@ -94,70 +95,28 @@ public: class WordSplitter { std::string const& text; std::vector &tokens; - agi::scoped_holder utf8_to_utf32; size_t pos; - bool IsWordSep(int chr) { - static const int delims[] = { - 0x0020, 0x0021, 0x0022, 0x0023, 0x0024, 0x0025, 0x0026, 0x0028, - 0x0029, 0x002a, 0x002b, 0x002c, 0x002d, 0x002e, 0x002f, 0x003a, - 0x003b, 0x003d, 0x003f, 0x0040, 0x005b, 0x005c, 0x005d, 0x005e, - 0x005f, 0x0060, 0x007b, 0x007c, 0x007d, 0x007e, 0x00a1, 0x00a2, - 0x00a3, 0x00a4, 0x00a5, 0x00a6, 0x00a7, 0x00a8, 0x00aa, 0x00ab, - 0x00b0, 0x00b6, 0x00b7, 0x00ba, 0x00bb, 0x00bf, 0x02dc, 0x0e3f, - 0x2010, 0x2013, 0x2014, 0x2015, 0x2018, 0x2019, 0x201c, 0x201d, - 0x2020, 0x2021, 0x2022, 0x2025, 0x2026, 0x2026, 0x2030, 0x2031, - 0x2032, 0x203b, 0x203b, 0x203d, 0x2042, 0x2044, 0x20a6, 0x20a9, - 0x20aa, 0x20ac, 0x20ad, 0x2116, 0x2234, 0x2235, 0x2420, 0x2422, - 0x2423, 0x2506, 0x25ca, 0x2605, 0x261e, 0x2e2e, 0x3000, 0x3001, - 0x3002, 0x3008, 0x3009, 0x300a, 0x300b, 0x300c, 0x300d, 0x300e, - 0x300f, 0x3010, 0x3011, 0x3014, 0x3015, 0x3016, 0x3017, 0x3018, - 0x3019, 0x301a, 0x301b, 0x301c, 0x3030, 0x303d, 0x30fb, 0xff0a, - 0xff5b, 0xff5d, 0xff5e - }; - - return std::binary_search(std::begin(delims), std::end(delims), chr); - } - - int NextChar(int pos, int len, int& char_len) { - int chr = 0; - char *inptr = const_cast(&text[pos]); - size_t inlen = len; - char *outptr = (char *)&chr; - size_t outlen = sizeof chr; - - iconv(utf8_to_utf32, &inptr, &inlen, &outptr, &outlen); - if (outlen != 0) - return 0; - - char_len = len - inlen; - return 
chr; - } - void SwitchTo(size_t &i, int type, int len) { - if (tokens[i].type == type) return; + auto old = tokens[i]; + tokens[i].type = type; + tokens[i].length = len; - if (tokens[i].length == (size_t)len) - tokens[i].type = type; - else { - tokens.insert(tokens.begin() + i + 1, DialogueToken(type, len)); - tokens[i].length -= len; + if (old.length != (size_t)len) { + tokens.insert(tokens.begin() + i + 1, DialogueToken(old.type, old.length - len)); ++i; } } void SplitText(size_t &i) { - int chrlen = 0; - int len = tokens[i].length; - int tpos = pos; - for (; len > 0; tpos += chrlen, len -= chrlen) { - int chr = NextChar(tpos, len, chrlen); - if (!chr) return; - - if (IsWordSep(chr)) - SwitchTo(i, dt::TEXT, len); - else + using namespace boost::locale::boundary; + ssegment_index map(word, text.begin() + pos, text.begin() + pos + tokens[i].length); + for (auto const& segment : map) { + int len = distance(begin(segment), end(segment)); + if (segment.rule() & word_letters) SwitchTo(i, dt::WORD, len); + else + SwitchTo(i, dt::TEXT, len); } } @@ -165,7 +124,6 @@ public: WordSplitter(std::string const& text, std::vector &tokens) : text(text) , tokens(tokens) - , utf8_to_utf32(iconv_open("utf-32le", "utf-8"), iconv_close) , pos(0) { } @@ -175,7 +133,7 @@ public: for (size_t i = 0; i < tokens.size(); ++i) { size_t len = tokens[i].length; if (tokens[i].type == dt::TEXT) - SplitText(i); + SplitText(i); pos += len; } } diff --git a/aegisub/tests/tests/word_split.cpp b/aegisub/tests/tests/word_split.cpp index 3ffb3a3be..4c19de763 100644 --- a/aegisub/tests/tests/word_split.cpp +++ b/aegisub/tests/tests/word_split.cpp @@ -126,10 +126,11 @@ TEST(lagi_word_split, unclosed_ovr) { }; SplitWords(text, tokens); - ASSERT_EQ(3u, tokens.size()); + ASSERT_EQ(4u, tokens.size()); EXPECT_EQ(dt::WORD, tokens[0].type); EXPECT_EQ(dt::TEXT, tokens[1].type); - EXPECT_EQ(dt::WORD, tokens[2].type); + EXPECT_EQ(dt::TEXT, tokens[2].type); + EXPECT_EQ(dt::WORD, tokens[3].type); text = "{"; 
tokens.clear(); @@ -140,3 +141,22 @@ TEST(lagi_word_split, unclosed_ovr) { EXPECT_EQ(dt::TEXT, tokens[0].type); } +TEST(lagi_word_split, several_words) { + std::string text = "a bb ccc dd e"; + std::vector tokens = { + {dt::TEXT, 13}, + }; + + SplitWords(text, tokens); + ASSERT_EQ(9u, tokens.size()); + EXPECT_EQ(1, tokens[0].length); + EXPECT_EQ(1, tokens[1].length); + EXPECT_EQ(2, tokens[2].length); + EXPECT_EQ(1, tokens[3].length); + EXPECT_EQ(3, tokens[4].length); + EXPECT_EQ(1, tokens[5].length); + EXPECT_EQ(2, tokens[6].length); + EXPECT_EQ(1, tokens[7].length); + EXPECT_EQ(1, tokens[8].length); +} +