diff --git a/aegisub/Makefile.inc.in b/aegisub/Makefile.inc.in
index 68b5fa254..a5d90b69f 100644
--- a/aegisub/Makefile.inc.in
+++ b/aegisub/Makefile.inc.in
@@ -76,7 +76,7 @@ CPPFLAGS_WX = @WX_CPPFLAGS@
LIBS_WX = @WX_LIBS@ -lz
CPPFLAGS_BOOST = @BOOST_CPPFLAGS@
-LIBS_BOOST = @BOOST_FILESYSTEM_LDFLAGS@ @BOOST_FILESYSTEM_LIBS@ @BOOST_REGEX_LDFLAGS@ @BOOST_REGEX_LIBS@ @BOOST_SYSTEM_LDFLAGS@ @BOOST_SYSTEM_LIBS@
+LIBS_BOOST = @BOOST_FILESYSTEM_LDFLAGS@ @BOOST_FILESYSTEM_LIBS@ @BOOST_LOCALE_LIBS@ @BOOST_REGEX_LIBS@ @BOOST_SYSTEM_LIBS@
CFLAGS_FFMS2 = @FFMS2_CFLAGS@
CFLAGS_FFTW3 = @FFTW3_CFLAGS@
diff --git a/aegisub/build/tests/tests.vcxproj b/aegisub/build/tests/tests.vcxproj
index 71d734c5f..ff4b5fdff 100644
--- a/aegisub/build/tests/tests.vcxproj
+++ b/aegisub/build/tests/tests.vcxproj
@@ -44,6 +44,7 @@
+
diff --git a/aegisub/build/tests/tests.vcxproj.filters b/aegisub/build/tests/tests.vcxproj.filters
index 75aa7e626..ad542727b 100644
--- a/aegisub/build/tests/tests.vcxproj.filters
+++ b/aegisub/build/tests/tests.vcxproj.filters
@@ -35,6 +35,9 @@
Tests
+
+ Tests
+
Tests
diff --git a/aegisub/configure.ac b/aegisub/configure.ac
index 1ee33c2c1..95531541b 100644
--- a/aegisub/configure.ac
+++ b/aegisub/configure.ac
@@ -248,6 +248,7 @@ PKG_CHECK_MODULES(FONTCONFIG, fontconfig >= fontconfig_required_version,
libext=a
BOOST_REQUIRE([boost_required_version])
BOOST_FILESYSTEM
+BOOST_LOCALE
BOOST_REGEX
########
diff --git a/aegisub/libaegisub/common/util.cpp b/aegisub/libaegisub/common/util.cpp
index b1b738e23..7c2191f6b 100644
--- a/aegisub/libaegisub/common/util.cpp
+++ b/aegisub/libaegisub/common/util.cpp
@@ -18,8 +18,45 @@
#include "libaegisub/util.h"
+#include "libaegisub/exception.h"
+
+#include
+#include
+#include
#include
+namespace {
+const size_t bad_pos = (size_t)-1;
+const std::pair bad_match(bad_pos, bad_pos);
+
+template
+size_t advance_both(Iterator& folded, Iterator& raw) {
+ size_t len;
+ if (*folded == *raw) {
+ len = folded->length();
+ ++folded;
+ }
+ else {
+ // This character was changed by case folding, so refold it and eat the
+ // appropriate number of characters from folded
+ len = boost::locale::fold_case(raw->str()).size();
+ for (size_t folded_consumed = 0; folded_consumed < len; ++folded)
+ folded_consumed += folded->length();
+ }
+
+ ++raw;
+ return len;
+}
+
+std::pair find_range(std::string const& haystack, std::string const& needle, size_t start = 0) {
+ const size_t match_start = haystack.find(needle, start);
+ if (match_start == std::string::npos)
+ return bad_match;
+ return std::make_pair(match_start, match_start + needle.size());
+}
+
+}
+
namespace agi { namespace util {
std::string strftime(const char *fmt, const tm *tmptr) {
@@ -33,4 +70,66 @@ std::string strftime(const char *fmt, const tm *tmptr) {
return buff;
}
+std::pair ifind(std::string const& haystack, std::string const& needle) {
+ const auto folded_hs = boost::locale::fold_case(haystack);
+ const auto folded_n = boost::locale::fold_case(needle);
+ auto match = find_range(folded_hs, folded_n);
+ if (match == bad_match || folded_hs == haystack)
+ return match;
+
+ // We have a match, but the position is an index into the folded string
+ // and we want an index into the unfolded string.
+
+ using namespace boost::locale::boundary;
+ const ssegment_index haystack_characters(character, begin(haystack), end(haystack));
+ const ssegment_index folded_characters(character, begin(folded_hs), end(folded_hs));
+ const size_t haystack_char_count = boost::distance(haystack_characters);
+ const size_t folded_char_count = boost::distance(folded_characters);
+
+ // As of Unicode 6.2, case folding can never reduce the number of
+ // characters, and can only reduce the number of bytes with UTF-8 when
+ // increasing the number of characters. As a result, iff the bytes and
+ // characters are unchanged, no folds changed the size of any characters
+ // and our indices are correct.
+ if (haystack.size() == folded_hs.size() && haystack_char_count == folded_char_count)
+ return match;
+
+ const auto map_folded_to_raw = [&]() -> std::pair {
+ size_t start = -1;
+
+ // Iterate over each pair of characters and refold each character which was
+ // changed by folding, so that we can find the corresponding positions in
+ // the unfolded string
+ auto folded_it = begin(folded_characters);
+ auto haystack_it = begin(haystack_characters);
+ size_t folded_pos = 0;
+
+ while (folded_pos < match.first)
+ folded_pos += advance_both(folded_it, haystack_it);
+ // If we overshot the start then the match started in the middle of a
+ // character which was folded to multiple characters
+ if (folded_pos > match.first)
+ return bad_match;
+
+ start = distance(begin(haystack), begin(*haystack_it));
+
+ while (folded_pos < match.second)
+ folded_pos += advance_both(folded_it, haystack_it);
+ if (folded_pos > match.second)
+ return bad_match;
+
+ return std::make_pair(start, distance(begin(haystack), begin(*haystack_it)));
+ };
+
+ auto ret = map_folded_to_raw();
+ while (ret == bad_match) {
+ // Found something, but it was an invalid match so retry from the next character
+ match = find_range(folded_hs, folded_n, match.first + 1);
+ if (match == bad_match) return match;
+ ret = map_folded_to_raw();
+ }
+
+ return ret;
+}
+
} }
diff --git a/aegisub/libaegisub/include/libaegisub/util.h b/aegisub/libaegisub/include/libaegisub/util.h
index 0a637c01f..95cbac266 100644
--- a/aegisub/libaegisub/include/libaegisub/util.h
+++ b/aegisub/libaegisub/include/libaegisub/util.h
@@ -41,6 +41,16 @@ namespace agi {
/// @return The strftime-formatted string
std::string strftime(const char *fmt, const tm *tmptr = nullptr);
+ /// Case-insensitive find with proper case folding
+ /// @param haystack String to search
+ /// @param needle String to look for
+ /// @return make_pair(-1,-1) if `needle` could not be found, or a range equivalent to `needle` in `haystack` if it could
+ ///
+ /// `needle` and `haystack` must both be in Normalization Form D. The size
+ /// of the match might be different from the size of `needle`, since it's
+ /// based on the unfolded length.
+ std::pair ifind(std::string const& haystack, std::string const& needle);
+
struct delete_ptr {
template
void operator()(T* ptr) const {
diff --git a/aegisub/src/dialog_search_replace.cpp b/aegisub/src/dialog_search_replace.cpp
index a8d932f9c..4573af8c7 100644
--- a/aegisub/src/dialog_search_replace.cpp
+++ b/aegisub/src/dialog_search_replace.cpp
@@ -36,6 +36,7 @@
#include
#include
#include
+#include
#include
#include
#include
diff --git a/aegisub/src/search_replace_engine.cpp b/aegisub/src/search_replace_engine.cpp
index 105f14ef0..aa9f4c1de 100644
--- a/aegisub/src/search_replace_engine.cpp
+++ b/aegisub/src/search_replace_engine.cpp
@@ -25,8 +25,9 @@
#include "text_selection_controller.h"
#include
+#include
-#include
+#include
#include
@@ -43,6 +44,14 @@ auto get_dialogue_field(SearchReplaceSettings::Field field) -> decltype(&AssDial
throw agi::InternalError("Bad field for search", 0);
}
+std::string const& get_normalized(const AssDialogue *diag, decltype(&AssDialogue::Text) field) {
+ auto& value = const_cast(diag)->*field;
+ auto normalized = boost::locale::normalize(value.get());
+ if (normalized != value)
+ value = normalized;
+ return value.get();
+}
+
typedef std::function matcher;
class noop_accessor {
@@ -54,7 +63,7 @@ public:
std::string get(const AssDialogue *d, size_t s) {
start = s;
- return (d->*field).get().substr(s);
+ return get_normalized(d, field).substr(s);
}
MatchState make_match_state(size_t s, size_t e, boost::u32regex *r = nullptr) {
@@ -87,7 +96,7 @@ public:
skip_tags_accessor(SearchReplaceSettings::Field f) : field(get_dialogue_field(f)), start(0) { }
std::string get(const AssDialogue *d, size_t s) {
- auto const& str = (d->*field).get();
+ auto const& str = get_normalized(d, field);
parse_str(str);
std::string out;
@@ -156,25 +165,25 @@ matcher get_matcher(SearchReplaceSettings const& settings, Accessor&& a) {
};
}
- bool full_match_only = settings.exact_match;
- bool match_case = settings.match_case;
+ const bool full_match_only = settings.exact_match;
+ const bool match_case = settings.match_case;
std::string look_for = settings.find;
+
if (!settings.match_case)
- boost::to_lower(look_for);
+ look_for = boost::locale::fold_case(look_for);
return [=](const AssDialogue *diag, size_t start) mutable -> MatchState {
- auto str = a.get(diag, start);
+ const auto str = a.get(diag, start);
if (full_match_only && str.size() != look_for.size())
return MatchState();
- if (!match_case)
- boost::to_lower(str);
+ if (match_case) {
+ const auto pos = str.find(look_for);
+ return pos == std::string::npos ? MatchState() : a.make_match_state(pos, pos + look_for.size());
+ }
- size_t pos = str.find(look_for);
- if (pos == std::string::npos)
- return MatchState();
-
- return a.make_match_state(pos, pos + look_for.size());
+ const auto pos = agi::util::ifind(str, look_for);
+ return pos.first == bad_pos ? MatchState() : a.make_match_state(pos.first, pos.second);
};
}
diff --git a/aegisub/tests/Makefile b/aegisub/tests/Makefile
index 076a21d09..8d82b08c8 100644
--- a/aegisub/tests/Makefile
+++ b/aegisub/tests/Makefile
@@ -23,6 +23,7 @@ SRC = \
tests/dialogue_lexer.cpp \
tests/hotkey.cpp \
tests/iconv.cpp \
+ tests/ifind.cpp \
tests/keyframe.cpp \
tests/line_iterator.cpp \
tests/line_wrap.cpp \
diff --git a/aegisub/tests/support/main.cpp b/aegisub/tests/support/main.cpp
index 87c32428f..3f0808246 100644
--- a/aegisub/tests/support/main.cpp
+++ b/aegisub/tests/support/main.cpp
@@ -18,8 +18,11 @@
#include
#include
+#include
+
int main(int argc, char **argv) {
agi::dispatch::Init([](agi::dispatch::Thunk f) { });
+ std::locale::global(boost::locale::generator().generate(""));
int retval;
agi::log::log = new agi::log::LogSink;
diff --git a/aegisub/tests/tests/ifind.cpp b/aegisub/tests/tests/ifind.cpp
new file mode 100644
index 000000000..baad59913
--- /dev/null
+++ b/aegisub/tests/tests/ifind.cpp
@@ -0,0 +1,75 @@
+// Copyright (c) 2013, Thomas Goyne
+//
+// Permission to use, copy, modify, and distribute this software for any
+// purpose with or without fee is hereby granted, provided that the above
+// copyright notice and this permission notice appear in all copies.
+//
+// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+// ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+// ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+// OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+//
+// Aegisub Project http://www.aegisub.org/
+
+#include
+
+#include "main.h"
+
+#define IFIND(haystack, needle) \
+ std::pair pos; \
+ ASSERT_NO_THROW(pos = agi::util::ifind(haystack, needle))
+
+#define EXPECT_IFIND(haystack, needle, s, e) \
+ do { \
+ IFIND(haystack, needle); \
+ EXPECT_EQ((size_t)s, pos.first); \
+ EXPECT_EQ((size_t)e, pos.second); \
+ } while(false)
+
+#define EXPECT_NO_MATCH(haystack, needle) \
+ do { \
+ IFIND(haystack, needle); \
+ EXPECT_EQ((size_t)-1, pos.first); \
+ EXPECT_EQ((size_t)-1, pos.second); \
+ } while(false)
+
+TEST(lagi_ifind, basic_match) {
+ EXPECT_IFIND(" a ", "a", 1, 2);
+ EXPECT_IFIND(" a ", "A", 1, 2);
+ EXPECT_NO_MATCH(" a ", "b");
+}
+
+TEST(lagi_ifind, sharp_s_matches_ss) {
+ // lowercase
+ EXPECT_IFIND(" \xC3\x9F ", "ss", 1, 3);
+ EXPECT_IFIND(" ss ", "\xC3\x9F", 1, 3);
+
+ // uppercase
+ EXPECT_IFIND(" \xE1\xBA\x9E ", "ss", 1, 4);
+ EXPECT_IFIND(" ss ", "\xE1\xBA\x9E", 1, 3);
+}
+
+TEST(lagi_ifind, no_partial_match_on_decomposed_character) {
+ EXPECT_NO_MATCH("s\xEF\xAC\x86", "ss"); // LATIN SMALL LIGATURE ST
+ EXPECT_NO_MATCH("\xEF\xAC\x86t", "tt");
+ EXPECT_NO_MATCH(" \xE1\xBA\x9E ", "s");
+ EXPECT_NO_MATCH("\xE1\xBA\x9E", "s");
+ EXPECT_IFIND(" \xE1\xBA\x9E s ", "s", 5, 6);
+ EXPECT_IFIND("s\xE1\xBA\x9E", "ss", 1, 4);
+ EXPECT_IFIND("s\xE1\xBA\x9E", "sss", 0, 4);
+ EXPECT_IFIND("\xE1\xBA\x9Es", "sss", 0, 4);
+ EXPECT_IFIND("\xEF\xAC\x86", "st", 0, 3);
+}
+
+TEST(lagi_ifind, correct_index_with_expanded_character_before_match) {
+ // U+0587 turns into U+0565 U+0582, all of which are two bytes in UTF-8
+ EXPECT_IFIND(" \xD6\x87 a ", "a", 4, 5);
+}
+
+TEST(lagi_ifind, correct_index_with_shrunk_character_before_match) {
+ // U+FB00 turns into "ff", which is one byte shorter in UTF-8
+ EXPECT_IFIND(" \xEF\xAC\x80 a ", "a", 5, 6);
+}