Use read_file_mapping for the thesaurus

2025-04-11 22:56:02 +02:00 · 2014-03-21 13:12:56 -07:00 · 2014-03-21 13:12:56 -07:00 · d454872c00
commit d454872c00
parent 0268ffd345
4 changed files with 35 additions and 23 deletions
--- a/libaegisub/common/charset_conv.cpp
+++ b/libaegisub/common/charset_conv.cpp
@ -295,16 +295,14 @@ IconvWrapper::IconvWrapper(const char* sourceEncoding, const char* destEncoding,
 IconvWrapper::~IconvWrapper() {
 }

-std::string IconvWrapper::Convert(std::string const& source) {
+std::string IconvWrapper::Convert(const char *source, size_t len) {
 	std::string dest;
-	Convert(source, dest);
+	Convert(source, len, dest);
 	return dest;
 }
-void IconvWrapper::Convert(std::string const& source, std::string &dest) {
+void IconvWrapper::Convert(const char *src, size_t srcLen, std::string &dest) {
 	char buff[512];

-	const char *src = source.data();
-	size_t srcLen = source.size();
 	size_t res;
 	do {
 		char *dst = buff;
--- a/libaegisub/common/thesaurus.cpp
+++ b/libaegisub/common/thesaurus.cpp
@ -19,10 +19,12 @@
 #include "libaegisub/thesaurus.h"

 #include "libaegisub/charset_conv.h"
-#include "libaegisub/io.h"
+#include "libaegisub/file_mapping.h"
 #include "libaegisub/line_iterator.h"
+#include "libaegisub/util.h"

 #include <boost/algorithm/string.hpp>
+#include <boost/interprocess/streams/bufferstream.hpp>
 #include <boost/phoenix/operator/comparison.hpp>
 #include <boost/phoenix/core/argument.hpp>

@ -33,17 +35,18 @@ using boost::phoenix::placeholders::_1;
 namespace agi {

 Thesaurus::Thesaurus(agi::fs::path const& dat_path, agi::fs::path const& idx_path)
-: dat(io::Open(dat_path))
+: dat(util::make_unique<read_file_mapping>(dat_path))
 {
-	auto idx = io::Open(idx_path);
+	read_file_mapping idx_file(idx_path);
+	boost::interprocess::ibufferstream idx(idx_file.read(), static_cast<size_t>(idx_file.size()));

 	std::string encoding_name;
-	getline(*idx, encoding_name);
+	getline(idx, encoding_name);
 	std::string unused_entry_count;
-	getline(*idx, unused_entry_count);
+	getline(idx, unused_entry_count);

 	// Read the list of words and file offsets for those words
-	for (auto const& line : line_iterator<std::string>(*idx, encoding_name)) {
+	for (auto const& line : line_iterator<std::string>(idx, encoding_name)) {
 		std::vector<std::string> chunks;
 		boost::split(chunks, line, _1 == '|');
 		if (chunks.size() == 2)
@ -61,25 +64,33 @@ std::vector<Thesaurus::Entry> Thesaurus::Lookup(std::string const& word) {

 	auto it = offsets.find(word);
 	if (it == offsets.end()) return out;
+	if (it->second >= dat->size()) return out;

-	dat->seekg(it->second, std::ios::beg);
-	if (!dat->good()) return out;
+	auto len = dat->size() - it->second;
+	auto buff = dat->read(it->second, len);
+	auto buff_end = buff + len;
+
+	std::string temp;
+	auto read_line = [&] () -> std::string const& {
+		auto start = buff;
+		auto end = std::find(buff, buff_end, '\n');
+		buff = end < buff_end ? end + 1 : buff_end;
+		if (end > start && end[-1] == '\r') --end;
+		temp.clear();
+		conv->Convert(start, end - start, temp);
+		return temp;
+	};

 	// First line is the word and meaning count
-	std::string temp;
-	getline(*dat, temp);
 	std::vector<std::string> header;
-	std::string converted(conv->Convert(temp));
-	boost::split(header, converted, _1 == '|');
+	boost::split(header, read_line(), _1 == '|');
 	if (header.size() != 2) return out;
 	int meanings = atoi(header[1].c_str());

 	out.reserve(meanings);
 	for (int i = 0; i < meanings; ++i) {
-		getline(*dat, temp);
-		auto converted = conv->Convert(temp);
 		std::vector<std::string> line;
-		boost::split(line, converted, _1 == '|');
+		boost::split(line, read_line(), _1 == '|');

 		if (line.size() < 2)
 			continue;
--- a/libaegisub/include/libaegisub/charset_conv.h
+++ b/libaegisub/include/libaegisub/charset_conv.h
@ -65,11 +65,13 @@ public:
 	/// @return Converted string. Note that std::string always uses a single byte
 	///         terminator, so c_str() may not return a valid string if the dest
 	///         charset has wider terminators
-	std::string Convert(std::string const& source);
+	std::string Convert(std::string const& source) { return Convert(source.c_str(), source.size()); }
+	std::string Convert(const char *source, size_t len);
 	/// @brief Convert a string from the source to destination charset
 	/// @param source String to convert
 	/// @param[out] dest String to place the result in
-	void Convert(std::string const& source, std::string &dest);
+	void Convert(std::string const& source, std::string &dest) { Convert(source.c_str(), source.size(), dest); }
+	void Convert(const char *source, size_t len, std::string &dest);
 	size_t Convert(const char* source, size_t sourceSize, char* dest, size_t destSize);
 	/// Bare wrapper around iconv; see iconv documentation for details
 	size_t Convert(const char **inbuf, size_t *inbytesleft, char **outbuf, size_t *outbytesleft);
--- a/libaegisub/include/libaegisub/thesaurus.h
+++ b/libaegisub/include/libaegisub/thesaurus.h
@ -26,13 +26,14 @@

 namespace agi {

+class read_file_mapping;
 namespace charset { class IconvWrapper; }

 class Thesaurus {
 	/// Map of word -> byte position in the data file
 	boost::container::flat_map<std::string, int> offsets;
 	/// Read handle to the data file
-	std::unique_ptr<std::istream> dat;
+	std::unique_ptr<read_file_mapping> dat;
 	/// Converter from the data file's charset to UTF-8
 	std::unique_ptr<charset::IconvWrapper> conv;