mirror of https://github.com/odrling/Aegisub
Use boost.locale for spellchecker word splitting
Because the global locale is used, this is currently only correct when the UI language is the same as the subtitle language, but it should still never be worse than the small hardcoded table of word separators. Closes #1206.
This commit is contained in:
parent
5efba3fda1
commit
10a88dfb52
|
@ -18,11 +18,12 @@
|
||||||
|
|
||||||
#include "libaegisub/ass/dialogue_parser.h"
|
#include "libaegisub/ass/dialogue_parser.h"
|
||||||
|
|
||||||
#include "libaegisub/scoped_ptr.h"
|
|
||||||
#include "libaegisub/spellchecker.h"
|
#include "libaegisub/spellchecker.h"
|
||||||
|
|
||||||
#include "iconv.h"
|
#include "iconv.h"
|
||||||
|
|
||||||
|
#include <boost/locale/boundary.hpp>
|
||||||
|
|
||||||
namespace {
|
namespace {
|
||||||
|
|
||||||
typedef std::vector<agi::ass::DialogueToken> TokenVec;
|
typedef std::vector<agi::ass::DialogueToken> TokenVec;
|
||||||
|
@ -94,70 +95,28 @@ public:
|
||||||
class WordSplitter {
|
class WordSplitter {
|
||||||
std::string const& text;
|
std::string const& text;
|
||||||
std::vector<DialogueToken> &tokens;
|
std::vector<DialogueToken> &tokens;
|
||||||
agi::scoped_holder<iconv_t, int(&)(iconv_t)> utf8_to_utf32;
|
|
||||||
size_t pos;
|
size_t pos;
|
||||||
|
|
||||||
bool IsWordSep(int chr) {
|
|
||||||
static const int delims[] = {
|
|
||||||
0x0020, 0x0021, 0x0022, 0x0023, 0x0024, 0x0025, 0x0026, 0x0028,
|
|
||||||
0x0029, 0x002a, 0x002b, 0x002c, 0x002d, 0x002e, 0x002f, 0x003a,
|
|
||||||
0x003b, 0x003d, 0x003f, 0x0040, 0x005b, 0x005c, 0x005d, 0x005e,
|
|
||||||
0x005f, 0x0060, 0x007b, 0x007c, 0x007d, 0x007e, 0x00a1, 0x00a2,
|
|
||||||
0x00a3, 0x00a4, 0x00a5, 0x00a6, 0x00a7, 0x00a8, 0x00aa, 0x00ab,
|
|
||||||
0x00b0, 0x00b6, 0x00b7, 0x00ba, 0x00bb, 0x00bf, 0x02dc, 0x0e3f,
|
|
||||||
0x2010, 0x2013, 0x2014, 0x2015, 0x2018, 0x2019, 0x201c, 0x201d,
|
|
||||||
0x2020, 0x2021, 0x2022, 0x2025, 0x2026, 0x2026, 0x2030, 0x2031,
|
|
||||||
0x2032, 0x203b, 0x203b, 0x203d, 0x2042, 0x2044, 0x20a6, 0x20a9,
|
|
||||||
0x20aa, 0x20ac, 0x20ad, 0x2116, 0x2234, 0x2235, 0x2420, 0x2422,
|
|
||||||
0x2423, 0x2506, 0x25ca, 0x2605, 0x261e, 0x2e2e, 0x3000, 0x3001,
|
|
||||||
0x3002, 0x3008, 0x3009, 0x300a, 0x300b, 0x300c, 0x300d, 0x300e,
|
|
||||||
0x300f, 0x3010, 0x3011, 0x3014, 0x3015, 0x3016, 0x3017, 0x3018,
|
|
||||||
0x3019, 0x301a, 0x301b, 0x301c, 0x3030, 0x303d, 0x30fb, 0xff0a,
|
|
||||||
0xff5b, 0xff5d, 0xff5e
|
|
||||||
};
|
|
||||||
|
|
||||||
return std::binary_search(std::begin(delims), std::end(delims), chr);
|
|
||||||
}
|
|
||||||
|
|
||||||
int NextChar(int pos, int len, int& char_len) {
|
|
||||||
int chr = 0;
|
|
||||||
char *inptr = const_cast<char *>(&text[pos]);
|
|
||||||
size_t inlen = len;
|
|
||||||
char *outptr = (char *)&chr;
|
|
||||||
size_t outlen = sizeof chr;
|
|
||||||
|
|
||||||
iconv(utf8_to_utf32, &inptr, &inlen, &outptr, &outlen);
|
|
||||||
if (outlen != 0)
|
|
||||||
return 0;
|
|
||||||
|
|
||||||
char_len = len - inlen;
|
|
||||||
return chr;
|
|
||||||
}
|
|
||||||
|
|
||||||
void SwitchTo(size_t &i, int type, int len) {
|
void SwitchTo(size_t &i, int type, int len) {
|
||||||
if (tokens[i].type == type) return;
|
auto old = tokens[i];
|
||||||
|
|
||||||
if (tokens[i].length == (size_t)len)
|
|
||||||
tokens[i].type = type;
|
tokens[i].type = type;
|
||||||
else {
|
tokens[i].length = len;
|
||||||
tokens.insert(tokens.begin() + i + 1, DialogueToken(type, len));
|
|
||||||
tokens[i].length -= len;
|
if (old.length != (size_t)len) {
|
||||||
|
tokens.insert(tokens.begin() + i + 1, DialogueToken(old.type, old.length - len));
|
||||||
++i;
|
++i;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
void SplitText(size_t &i) {
|
void SplitText(size_t &i) {
|
||||||
int chrlen = 0;
|
using namespace boost::locale::boundary;
|
||||||
int len = tokens[i].length;
|
ssegment_index map(word, text.begin() + pos, text.begin() + pos + tokens[i].length);
|
||||||
int tpos = pos;
|
for (auto const& segment : map) {
|
||||||
for (; len > 0; tpos += chrlen, len -= chrlen) {
|
int len = distance(begin(segment), end(segment));
|
||||||
int chr = NextChar(tpos, len, chrlen);
|
if (segment.rule() & word_letters)
|
||||||
if (!chr) return;
|
|
||||||
|
|
||||||
if (IsWordSep(chr))
|
|
||||||
SwitchTo(i, dt::TEXT, len);
|
|
||||||
else
|
|
||||||
SwitchTo(i, dt::WORD, len);
|
SwitchTo(i, dt::WORD, len);
|
||||||
|
else
|
||||||
|
SwitchTo(i, dt::TEXT, len);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -165,7 +124,6 @@ public:
|
||||||
WordSplitter(std::string const& text, std::vector<DialogueToken> &tokens)
|
WordSplitter(std::string const& text, std::vector<DialogueToken> &tokens)
|
||||||
: text(text)
|
: text(text)
|
||||||
, tokens(tokens)
|
, tokens(tokens)
|
||||||
, utf8_to_utf32(iconv_open("utf-32le", "utf-8"), iconv_close)
|
|
||||||
, pos(0)
|
, pos(0)
|
||||||
{ }
|
{ }
|
||||||
|
|
||||||
|
|
|
@ -126,10 +126,11 @@ TEST(lagi_word_split, unclosed_ovr) {
|
||||||
};
|
};
|
||||||
|
|
||||||
SplitWords(text, tokens);
|
SplitWords(text, tokens);
|
||||||
ASSERT_EQ(3u, tokens.size());
|
ASSERT_EQ(4u, tokens.size());
|
||||||
EXPECT_EQ(dt::WORD, tokens[0].type);
|
EXPECT_EQ(dt::WORD, tokens[0].type);
|
||||||
EXPECT_EQ(dt::TEXT, tokens[1].type);
|
EXPECT_EQ(dt::TEXT, tokens[1].type);
|
||||||
EXPECT_EQ(dt::WORD, tokens[2].type);
|
EXPECT_EQ(dt::TEXT, tokens[2].type);
|
||||||
|
EXPECT_EQ(dt::WORD, tokens[3].type);
|
||||||
|
|
||||||
text = "{";
|
text = "{";
|
||||||
tokens.clear();
|
tokens.clear();
|
||||||
|
@ -140,3 +141,22 @@ TEST(lagi_word_split, unclosed_ovr) {
|
||||||
EXPECT_EQ(dt::TEXT, tokens[0].type);
|
EXPECT_EQ(dt::TEXT, tokens[0].type);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
TEST(lagi_word_split, several_words) {
|
||||||
|
std::string text = "a bb ccc dd e";
|
||||||
|
std::vector<DialogueToken> tokens = {
|
||||||
|
{dt::TEXT, 13},
|
||||||
|
};
|
||||||
|
|
||||||
|
SplitWords(text, tokens);
|
||||||
|
ASSERT_EQ(9u, tokens.size());
|
||||||
|
EXPECT_EQ(1, tokens[0].length);
|
||||||
|
EXPECT_EQ(1, tokens[1].length);
|
||||||
|
EXPECT_EQ(2, tokens[2].length);
|
||||||
|
EXPECT_EQ(1, tokens[3].length);
|
||||||
|
EXPECT_EQ(3, tokens[4].length);
|
||||||
|
EXPECT_EQ(1, tokens[5].length);
|
||||||
|
EXPECT_EQ(2, tokens[6].length);
|
||||||
|
EXPECT_EQ(1, tokens[7].length);
|
||||||
|
EXPECT_EQ(1, tokens[8].length);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue