From 2de95818dbf3994dccd04a9f77e13cf5f750655e Mon Sep 17 00:00:00 2001 From: Thomas Goyne Date: Fri, 18 Apr 2014 18:36:43 -0700 Subject: [PATCH] Use ICU directly for character counting ICU docs say not to create a new break iterator each time as boost.locale does, and in fact creating the break iterator is about 90% of the run time of the character counter, so use ICU directly and cache the break iterator. --- libaegisub/common/character_count.cpp | 55 +++++++++++++++++++++------ 1 file changed, 43 insertions(+), 12 deletions(-) diff --git a/libaegisub/common/character_count.cpp b/libaegisub/common/character_count.cpp index 20ea87510..3d7878e1f 100644 --- a/libaegisub/common/character_count.cpp +++ b/libaegisub/common/character_count.cpp @@ -17,28 +17,59 @@ #include "libaegisub/character_count.h" #include "libaegisub/ass/dialogue_parser.h" +#include "libaegisub/exception.h" -#include -#include #include #include +#include +#include + namespace { +struct utext_deleter { + void operator()(UText *ut) { if (ut) utext_close(ut); } +}; +using utext_ptr = std::unique_ptr; + +template +utext_ptr to_utext(Iterator begin, Iterator end) { + UErrorCode err = U_ZERO_ERROR; + utext_ptr ret(utext_openUTF8(nullptr, &*begin, end - begin, &err)); + if (U_FAILURE(err)) throw agi::InternalError("Failed to open utext", nullptr); + return ret; +} + template size_t count_in_range(Iterator begin, Iterator end, bool ignore_whitespace) { - using namespace boost::locale::boundary; - const ssegment_index characters(character, begin, end); - if (!ignore_whitespace) - return boost::distance(characters); + if (begin == end) return 0; + + static std::unique_ptr character_bi; + static std::once_flag token; + std::call_once(token, [&] { + UErrorCode status = U_ZERO_ERROR; + character_bi.reset(BreakIterator::createCharacterInstance(Locale::getDefault(), status)); + if (U_FAILURE(status)) throw agi::InternalError("Failed to create character iterator", nullptr); + }); + + UErrorCode err = U_ZERO_ERROR; + + utext_ptr ut = to_utext(begin, end); + character_bi->setText(ut.get(), err); + if (U_FAILURE(err)) throw agi::InternalError("Failed to set break iterator text", nullptr); - // characters.rule(word_any) doesn't seem to work for character indexes (everything is word_none) size_t count = 0; - for (auto const& chr : characters) { - UChar32 c; - int i = 0; - U8_NEXT_UNSAFE(chr.begin(), i, c); - if (!u_isUWhiteSpace(c)) + auto pos = character_bi->first(); + for (auto end = character_bi->next(); end != BreakIterator::DONE; pos = end, end = character_bi->next()) { + if (!ignore_whitespace) ++count; + else { + UChar32 c; + int i = 0; + U8_NEXT_UNSAFE(begin + pos, i, c); + if (!u_isUWhiteSpace(c)) + ++count; + } + } return count; }