Aegisub/libaegisub/common/karaoke_matcher.cpp

210 lines
7.1 KiB
C++

// Copyright (c) 2013, Thomas Goyne <plorkyeran@aegisub.org>
//
// Permission to use, copy, modify, and distribute this software for any
// purpose with or without fee is hereby granted, provided that the above
// copyright notice and this permission notice appear in all copies.
//
// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
// ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
// ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
// OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
//
// Aegisub Project http://www.aegisub.org/
#include "../config.h"
#include "libaegisub/karaoke_matcher.h"
#include "libaegisub/kana_table.h"
#include "libaegisub/util.h"
#include <boost/algorithm/string/case_conv.hpp>
#include <boost/algorithm/string/predicate.hpp>
#include <boost/locale/boundary.hpp>
#include <boost/locale/collator.hpp>
#include <boost/range/algorithm/copy.hpp>
#include <unicode/uchar.h>
#include <unicode/utf8.h>
namespace {
int32_t next_codepoint(const char *str, size_t *i) {
UChar32 c;
U8_NEXT_UNSAFE(str, *i, c);
return c;
}
/// Check whether a single code point is whitespace according to ICU's
/// u_isUWhiteSpace().
///
/// @param c  Code point to test.
/// @return true if ICU reports the code point as whitespace.
bool is_whitespace(int32_t c) {
	// Compare against zero to turn ICU's UBool into a real bool.
	return u_isUWhiteSpace(c) != 0;
}
bool is_whitespace(std::string const& str) {
size_t i = 0;
while (auto c = next_codepoint(str.c_str(), &i)) {
if (!u_isUWhiteSpace(c))
return false;
}
return true;
}
// strcmp but ignoring case and accents
int compare(std::string const& a, std::string const& b) {
using namespace boost::locale;
return std::use_facet<collator<char>>(std::locale()).compare(collator_base::primary, a, b);
}
}
namespace agi {
/// Guess how much of the destination text corresponds to the first source
/// syllable.
///
/// The first source syllable is lower-cased and matched against the start of
/// the destination, character by character (as split by Boost.Locale's
/// `character` boundary rule): first by a case/accent-insensitive comparison,
/// then by romaji -> kana lookup. If part of the syllable cannot be matched
/// this way, the function looks a few destination characters ahead for the
/// start of a later source syllable and uses the distance at which it was
/// found to guess the split.
///
/// @param source_strings  Source syllables; element 0 is the syllable being
///                        matched, later elements are only used for lookahead.
/// @param dest_string     Destination text to match against (UTF-8).
/// @return A result whose source_length is the number of source syllables to
///         consume and whose destination_length is the number of destination
///         characters to consume. Both are zero when source_strings is empty.
karaoke_match_result auto_match_karaoke(std::vector<std::string> const& source_strings, std::string const& dest_string) {
	karaoke_match_result result = { 0, 0 };
	if (source_strings.empty()) return result;

	using namespace boost::locale::boundary;
	using boost::starts_with;

	result.source_length = 1;

	ssegment_index destination_characters(character, begin(dest_string), end(dest_string));

	auto src = boost::to_lower_copy(source_strings[0]);
	auto dst = destination_characters.begin();
	auto dst_end = destination_characters.end();

	// Eat all the whitespace at the beginning of the source and destination
	// syllables and exit if either ran out.
	auto eat_whitespace = [&]() -> bool {
		size_t i = 0, first_non_whitespace = 0;
		while (is_whitespace(next_codepoint(src.c_str(), &i)))
			first_non_whitespace = i;
		if (first_non_whitespace)
			src = src.substr(first_non_whitespace);

		while (dst != dst_end && is_whitespace(dst->str())) {
			++dst;
			++result.destination_length;
		}

		// If we ran out of dest then this needs to match the rest of the
		// source syllables (this probably means the user did something wrong)
		if (dst == dst_end) {
			result.source_length = source_strings.size();
			return true;
		}

		return src.empty();
	};

	if (eat_whitespace()) return result;

	// We now have a non-whitespace character at the beginning of both source
	// and destination. Check if the source starts with a romanized kana, and
	// if it does then check if the destination also has the appropriate
	// character. If it does, match them and repeat.
	while (!src.empty()) {
		// First check for a basic match of the first character of the source and dest
		auto first_src_char = ssegment_index(character, begin(src), end(src)).begin()->str();
		if (compare(first_src_char, dst->str()) == 0) {
			++dst;
			++result.destination_length;
			src.erase(0, first_src_char.size());
			if (eat_whitespace()) return result;
			continue;
		}

		// Try to consume kp.romaji from the source and kp.kana from the
		// destination. Returns false (leaving everything untouched) if the
		// destination does not start with the kana.
		auto check = [&](kana_pair const& kp) -> bool {
			if (!starts_with(&*dst->begin(), kp.kana)) return false;

			// Work out how many destination characters the kana covers.
			// A destination character may be longer than the remaining kana
			// bytes (e.g. a kana followed by a combining mark forms a single
			// character), so the walk is bounded by the kana's length and by
			// dst_end rather than by indexing into kp.kana, which could step
			// over its terminating NUL.
			const size_t kana_length = strlen(kp.kana);
			auto kana_end = dst;
			size_t consumed_bytes = 0;
			size_t consumed_characters = 0;
			while (consumed_bytes < kana_length && kana_end != dst_end) {
				consumed_bytes += kana_end->length();
				++consumed_characters;
				++kana_end;
			}

			// The kana ends in the middle of a destination character, so the
			// destination does not actually contain this kana as a unit.
			if (consumed_bytes != kana_length) return false;

			src = src.substr(strlen(kp.romaji));
			result.destination_length += consumed_characters;
			dst = kana_end;
			return true;
		};

		bool matched = false;
		for (auto const& match : romaji_to_kana(src)) {
			if (check(match)) {
				if (eat_whitespace()) return result;
				matched = true;
				break;
			}
		}
		if (!matched) break;
	}

	// Source and dest are now non-empty and start with non-whitespace.
	// If there's only one character left in the dest, it obviously needs to
	// match all of the source syllables left.
	if (std::distance(dst, dst_end) == 1) {
		result.source_length = source_strings.size();
		++result.destination_length;
		return result;
	}

	// We couldn't match the current character, but if we can match the *next*
	// syllable then we know that everything in between must belong to the
	// current syllable. Do this by looking up to dst_lookahead_max
	// characters ahead in destination and seeing if we can match them against
	// the beginning of a syllable after this syllable.
	// If a match is found, make a guess at how much source and destination
	// should be selected based on the distances it was found at.

	// The longest kanji are 'uketamawa.ru' and 'kokorozashi', each with a
	// reading consisting of five kana. This means each character from
	// the destination can match at most five syllables from the source.
	static const int max_character_length = 5;

	// Arbitrarily chosen limit on the number of dest characters to try
	// skipping. Higher numbers probably increase false-positives.
	static const size_t dst_lookahead_max = 3;

	for (size_t lookahead = 0; lookahead < dst_lookahead_max; ++lookahead) {
		if (++dst == dst_end) break;

		// The destination character being searched for; computed once rather
		// than for every source syllable.
		const std::string dst_char = dst->str();

		// Transliterate this character if it's a known hiragana or katakana character
		std::vector<const char *> translit;
		auto next = std::next(dst);
		if (next != dst_end)
			boost::copy(kana_to_romaji(dst_char + next->str()), back_inserter(translit));
		boost::copy(kana_to_romaji(dst_char), back_inserter(translit));

		// Search for it and the transliterated version in the source
		const int src_lookahead_max = static_cast<int>(lookahead + 1) * max_character_length;
		int src_lookahead_pos = 0;
		for (auto const& syl : source_strings) {
			// Don't count blank syllables in the max search distance
			if (is_whitespace(syl)) continue;
			// The first non-blank syllable is the one currently being matched
			if (++src_lookahead_pos == 1) continue;
			if (src_lookahead_pos > src_lookahead_max) break;

			std::string lsyl = boost::to_lower_copy(syl);
			if (!(starts_with(syl, dst_char) || util::any_of(translit, [&](const char *str) { return starts_with(lsyl, str); })))
				continue;

			// The syllable immediately after the current one matched, so
			// everything up to the match must go with the current syllable.
			if (src_lookahead_pos == 2) {
				result.destination_length += lookahead + 1;
				return result;
			}

			// The match was multiple syllables ahead, so just divide the
			// destination characters evenly between the source syllables
			result.destination_length += 1;
			result.source_length = static_cast<size_t>((src_lookahead_pos - 1.0) / (lookahead + 1.0) + .5);
			return result;
		}
	}

	// We wouldn't have gotten here if the dest was empty, so make sure at
	// least one character is selected
	result.destination_length = std::max<size_t>(result.destination_length, 1u);

	return result;
}
}