// Patch summary (review preamble; text ahead of the first "diff --git" header is not part of any hunk).
//
// Files touched: libaegisub/common/charset_conv.cpp, libaegisub/common/thesaurus.cpp,
// libaegisub/include/libaegisub/charset_conv.h, libaegisub/include/libaegisub/thesaurus.h.
//
// charset_conv.{h,cpp}:
//   - The out-of-line IconvWrapper::Convert overloads now take (const char *, size_t) instead of
//     std::string const&.
//   - The std::string const& overloads remain available as inline wrappers in the header that
//     forward source.c_str() and source.size(), so existing callers keep compiling.
//
// thesaurus.{h,cpp}:
//   - `dat` is now built from dat_path via util::make_unique and is used through size() and
//     read(offset, len) instead of being a stream opened with io::Open.
//   - The constructor reads the index through a read_file_mapping wrapped in a
//     boost::interprocess::ibufferstream, then parses the encoding name, the (unused) entry
//     count and the word|offset lines as before.
//   - Lookup returns early when the stored offset is >= dat->size(). Otherwise it reads from the
//     offset to the end of the data and pulls lines out of that buffer with a local read_line
//     lambda: it splits on '\n', drops one trailing '\r', clears `temp`, and converts the line
//     into `temp` with conv->Convert(start, end - start, temp).
//
// NOTE(review): this copy of the patch has its line breaks collapsed, and text inside angle
//   brackets appears to be missing (e.g. `#include` with no target, `util::make_unique(dat_path)`,
//   `static_cast(idx_file.size())`, `std::vector chunks`, `std::unique_ptr dat`). Presumably an
//   extraction artifact - regenerate the patch from the commit before applying it.
// NOTE(review): `int meanings = atoi(header[1].c_str()); out.reserve(meanings);` passes the parsed
//   count to reserve() unchecked. A negative or absurd count in a damaged data file would
//   presumably become a huge size request - TODO confirm and consider clamping in a follow-up.
// NOTE(review): the context line in charset_conv.h spells "documention". It is left as-is here
//   because context lines must match the target file; fix it in a separate change.
diff --git a/libaegisub/common/charset_conv.cpp b/libaegisub/common/charset_conv.cpp index 9ed31ad3c..6902ac34d 100644 --- a/libaegisub/common/charset_conv.cpp +++ b/libaegisub/common/charset_conv.cpp @@ -295,16 +295,14 @@ IconvWrapper::IconvWrapper(const char* sourceEncoding, const char* destEncoding, IconvWrapper::~IconvWrapper() { } -std::string IconvWrapper::Convert(std::string const& source) { +std::string IconvWrapper::Convert(const char *source, size_t len) { std::string dest; - Convert(source, dest); + Convert(source, len, dest); return dest; } -void IconvWrapper::Convert(std::string const& source, std::string &dest) { +void IconvWrapper::Convert(const char *src, size_t srcLen, std::string &dest) { char buff[512]; - const char *src = source.data(); - size_t srcLen = source.size(); size_t res; do { char *dst = buff; diff --git a/libaegisub/common/thesaurus.cpp b/libaegisub/common/thesaurus.cpp index 15921fa17..ed1d35255 100644 --- a/libaegisub/common/thesaurus.cpp +++ b/libaegisub/common/thesaurus.cpp @@ -19,10 +19,12 @@ #include "libaegisub/thesaurus.h" #include "libaegisub/charset_conv.h" -#include "libaegisub/io.h" +#include "libaegisub/file_mapping.h" #include "libaegisub/line_iterator.h" +#include "libaegisub/util.h" #include +#include #include #include @@ -33,17 +35,18 @@ using boost::phoenix::placeholders::_1; namespace agi { Thesaurus::Thesaurus(agi::fs::path const& dat_path, agi::fs::path const& idx_path) -: dat(io::Open(dat_path)) +: dat(util::make_unique(dat_path)) { - auto idx = io::Open(idx_path); + read_file_mapping idx_file(idx_path); + boost::interprocess::ibufferstream idx(idx_file.read(), static_cast(idx_file.size())); std::string encoding_name; - getline(*idx, encoding_name); + getline(idx, encoding_name); std::string unused_entry_count; - getline(*idx, unused_entry_count); + getline(idx, unused_entry_count); // Read the list of words and file offsets for those words - for (auto const& line : line_iterator(*idx, encoding_name)) { + for 
(auto const& line : line_iterator(idx, encoding_name)) { std::vector chunks; boost::split(chunks, line, _1 == '|'); if (chunks.size() == 2) @@ -61,25 +64,33 @@ std::vector Thesaurus::Lookup(std::string const& word) { auto it = offsets.find(word); if (it == offsets.end()) return out; + if (it->second >= dat->size()) return out; - dat->seekg(it->second, std::ios::beg); - if (!dat->good()) return out; + auto len = dat->size() - it->second; + auto buff = dat->read(it->second, len); + auto buff_end = buff + len; + + std::string temp; + auto read_line = [&] () -> std::string const& { + auto start = buff; + auto end = std::find(buff, buff_end, '\n'); + buff = end < buff_end ? end + 1 : buff_end; + if (end > start && end[-1] == '\r') --end; + temp.clear(); + conv->Convert(start, end - start, temp); + return temp; + }; // First line is the word and meaning count - std::string temp; - getline(*dat, temp); std::vector header; - std::string converted(conv->Convert(temp)); - boost::split(header, converted, _1 == '|'); + boost::split(header, read_line(), _1 == '|'); if (header.size() != 2) return out; int meanings = atoi(header[1].c_str()); out.reserve(meanings); for (int i = 0; i < meanings; ++i) { - getline(*dat, temp); - auto converted = conv->Convert(temp); std::vector line; - boost::split(line, converted, _1 == '|'); + boost::split(line, read_line(), _1 == '|'); if (line.size() < 2) continue; diff --git a/libaegisub/include/libaegisub/charset_conv.h b/libaegisub/include/libaegisub/charset_conv.h index fb7048ed2..6de377652 100644 --- a/libaegisub/include/libaegisub/charset_conv.h +++ b/libaegisub/include/libaegisub/charset_conv.h @@ -65,11 +65,13 @@ public: /// @return Converted string. 
Note that std::string always uses a single byte /// terminator, so c_str() may not return a valid string if the dest /// charset has wider terminators - std::string Convert(std::string const& source); + std::string Convert(std::string const& source) { return Convert(source.c_str(), source.size()); } + std::string Convert(const char *source, size_t len); /// @brief Convert a string from the source to destination charset /// @param source String to convert /// @param[out] dest String to place the result in - void Convert(std::string const& source, std::string &dest); + void Convert(std::string const& source, std::string &dest) { Convert(source.c_str(), source.size(), dest); } + void Convert(const char *source, size_t len, std::string &dest); size_t Convert(const char* source, size_t sourceSize, char* dest, size_t destSize); /// Bare wrapper around iconv; see iconv documention for details size_t Convert(const char **inbuf, size_t *inbytesleft, char **outbuf, size_t *outbytesleft); diff --git a/libaegisub/include/libaegisub/thesaurus.h b/libaegisub/include/libaegisub/thesaurus.h index 5e889a2be..48cf0f653 100644 --- a/libaegisub/include/libaegisub/thesaurus.h +++ b/libaegisub/include/libaegisub/thesaurus.h @@ -26,13 +26,14 @@ namespace agi { +class read_file_mapping; namespace charset { class IconvWrapper; } class Thesaurus { /// Map of word -> byte position in the data file boost::container::flat_map offsets; /// Read handle to the data file - std::unique_ptr dat; + std::unique_ptr dat; /// Converter from the data file's charset to UTF-8 std::unique_ptr conv;