optimize resolve_duplicate_filenames_slow()

2019-04-05 01:27:46 +02:00 · 2019-04-05 01:27:46 +02:00 · 4441655bab
parent 212ea3db27
commit 4441655bab
6 changed files with 100 additions and 85 deletions
--- a/1
+++ b/1
@ -1,3 +1,4 @@
+	* optimize resolving duplicate filenames in loading torrent files
 	* fix python binding of dht_settings
 	* tighten up various input validation checks
 	* fix create_torrent python binding
--- a/include/libtorrent/string_util.hpp
+++ b/include/libtorrent/string_util.hpp
@ -121,15 +121,6 @@ namespace libtorrent {
 	TORRENT_EXTRA_EXPORT bool is_i2p_url(std::string const& url);

 #endif
-
-	// this can be used as the hash function in std::unordered_*
-	struct TORRENT_EXTRA_EXPORT string_hash_no_case
-	{ size_t operator()(std::string const& s) const; };
-
-	// these can be used as the comparison functions in std::map and std::set
-	struct TORRENT_EXTRA_EXPORT string_eq_no_case
-	{ bool operator()(std::string const& lhs, std::string const& rhs) const; };
-
 }

 #endif
--- a/src/string_util.cpp
+++ b/src/string_util.cpp
@ -379,28 +379,4 @@ namespace libtorrent {
 	}

 #endif
-
-	std::size_t string_hash_no_case::operator()(std::string const& s) const
-	{
-		std::size_t ret = 5381;
-		for (auto const c : s)
-			ret = (ret * 33) ^ static_cast<std::size_t>(to_lower(c));
-		return ret;
-	}
-
-	bool string_eq_no_case::operator()(std::string const& lhs, std::string const& rhs) const
-	{
-		if (lhs.size() != rhs.size()) return false;
-
-		auto s1 = lhs.cbegin();
-		auto s2 = rhs.cbegin();
-
-		while (s1 != lhs.end() && s2 != rhs.end())
-		{
-			if (to_lower(*s1) != to_lower(*s2)) return false;
-			++s1;
-			++s2;
-		}
-		return true;
-	}
 }
--- a/src/torrent_info.cpp
+++ b/src/torrent_info.cpp
@ -59,6 +59,10 @@ POSSIBILITY OF SUCH DAMAGE.
 #include "libtorrent/lazy_entry.hpp"
 #endif

+#include "libtorrent/aux_/disable_warnings_push.hpp"
+#include <boost/crc.hpp>
+#include "libtorrent/aux_/disable_warnings_pop.hpp"
+
 #include <unordered_map>
 #include <unordered_set>
 #include <cstdint>
@ -640,58 +644,117 @@ namespace {
 		}
 	}

+namespace {
+
+	template <class CRC>
+	void process_string_lowercase(CRC& crc, string_view str)
+	{
+		for (char const c : str)
+			crc.process_byte(to_lower(c) & 0xff);
+	}
+
+	struct name_entry
+	{
+		file_index_t idx;
+		int length;
+	};
+}
+
 	void torrent_info::resolve_duplicate_filenames_slow()
 	{
 		INVARIANT_CHECK;

-		std::unordered_map<std::string, file_index_t, string_hash_no_case, string_eq_no_case> files;
+		// maps filename hash to file index
+		// or, if the file_index is negative, maps into the paths vector
+		std::unordered_multimap<std::uint32_t, name_entry> files;

 		std::vector<std::string> const& paths = m_files.paths();
 		files.reserve(paths.size() + aux::numeric_cast<std::size_t>(m_files.num_files()));

 		// insert all directories first, to make sure no files
 		// are allowed to collied with them
-		for (auto const& i : paths)
 		{
-			std::string p = combine_path(m_files.name(), i);
-			files.insert({p, file_index_t{-1}});
-			while (has_parent_path(p))
+			boost::crc_optimal<32, 0x1EDC6F41, 0xFFFFFFFF, 0xFFFFFFFF, true, true> crc;
+			if (!m_files.name().empty())
 			{
-				p = parent_path(std::move(p));
-				// we don't want trailing slashes here
-				TORRENT_ASSERT(p[p.size() - 1] == TORRENT_SEPARATOR);
-				p.resize(p.size() - 1);
-				files.insert({p, file_index_t{-1}});
+				process_string_lowercase(crc, m_files.name());
+			}
+			file_index_t path_index{-1};
+			for (auto const& path : paths)
+			{
+				auto local_crc = crc;
+				if (!path.empty()) local_crc.process_byte(TORRENT_SEPARATOR);
+				int count = 0;
+				for (char const c : path)
+				{
+					if (c == TORRENT_SEPARATOR)
+						files.insert({local_crc.checksum(), {path_index, count}});
+					local_crc.process_byte(to_lower(c) & 0xff);
+					++count;
+				}
+				files.insert({local_crc.checksum(), {path_index, int(path.size())}});
+				--path_index;
 			}
 		}

+		// keep track of the total number of name collisions. If there are too
+		// many, it's probably a malicious torrent and we should just fail
+		int num_collisions = 0;
 		for (auto const i : m_files.file_range())
 		{
 			// as long as this file already exists
 			// increase the counter
-			std::string filename = m_files.file_path(i);
-			auto const ret = files.insert({filename, i});
-			if (ret.second) continue;
+			std::uint32_t const hash = m_files.file_path_hash(i, "");
+			auto range = files.equal_range(hash);
+			auto const match = std::find_if(range.first, range.second, [&](std::pair<std::uint32_t, name_entry> const& o)
+			{
+				std::string const other_name = o.second.idx < file_index_t{}
+					? combine_path(m_files.name(), paths[std::size_t(-static_cast<int>(o.second.idx)-1)].substr(0, std::size_t(o.second.length)))
+					: m_files.file_path(o.second.idx);
+				return string_equal_no_case(other_name, m_files.file_path(i));
+			});
+
+			if (match == range.second)
+			{
+				files.insert({hash, {i, 0}});
+				continue;
+			}
+
 			// pad files are allowed to collide with each-other, as long as they have
 			// the same size.
-			file_index_t const other_idx = ret.first->second;
-			if (other_idx != file_index_t{-1}
+			file_index_t const other_idx = match->second.idx;
+			if (other_idx >= file_index_t{}
 				&& (m_files.file_flags(i) & file_storage::flag_pad_file)
 				&& (m_files.file_flags(other_idx) & file_storage::flag_pad_file)
 				&& m_files.file_size(i) == m_files.file_size(other_idx))
 				continue;

+			std::string filename = m_files.file_path(i);
 			std::string base = remove_extension(filename);
 			std::string ext = extension(filename);
 			int cnt = 0;
-			do
+			for (;;)
 			{
 				++cnt;
 				char new_ext[50];
 				std::snprintf(new_ext, sizeof(new_ext), ".%d%s", cnt, ext.c_str());
 				filename = base + new_ext;
+
+				boost::crc_optimal<32, 0x1EDC6F41, 0xFFFFFFFF, 0xFFFFFFFF, true, true> crc;
+				process_string_lowercase(crc, filename);
+				std::uint32_t const new_hash = crc.checksum();
+				if (files.find(new_hash) == files.end())
+				{
+					files.insert({new_hash, {i, 0}});
+					break;
+				}
+				++num_collisions;
+				if (num_collisions > 100)
+				{
+				// TODO: this should be considered a failure, and the .torrent file
+				// rejected
+				}
 			}
-			while (!files.insert({filename, i}).second);

 			copy_on_write();
 			m_files.rename_file(i, filename);
--- a/test/test_string.cpp
+++ b/test/test_string.cpp
@ -455,41 +455,6 @@ TORRENT_TEST(i2p_url)
 }
 #endif

-TORRENT_TEST(string_hash_no_case)
-{
-	string_hash_no_case hsh;
-
-	// make sure different strings yield different hashes
-	TEST_CHECK(hsh("ab") != hsh("ba"));
-
-	// make sure case is ignored
-	TEST_EQUAL(hsh("Ab"), hsh("ab"));
-	TEST_EQUAL(hsh("Ab"), hsh("aB"));
-
-	// make sure zeroes in strings are supported
-	TEST_CHECK(hsh(std::string("\0a", 2)) != hsh(std::string("\0b", 2)));
-	TEST_EQUAL(hsh(std::string("\0a", 2)), hsh(std::string("\0a", 2)));
-}
-
-TORRENT_TEST(string_eq_no_case)
-{
-	string_eq_no_case cmp;
-	TEST_CHECK(cmp("ab", "ba") == false);
-	TEST_CHECK(cmp("", ""));
-	TEST_CHECK(cmp("abc", "abc"));
-
-	// make sure different lengths are correctly treated as different
-	TEST_CHECK(cmp("abc", "ab") == false);
-
-	// make sure case is ignored
-	TEST_CHECK(cmp("Ab", "ab"));
-	TEST_CHECK(cmp("Ab", "aB"));
-
-	// make sure zeros are supported
-	TEST_CHECK(cmp(std::string("\0a", 2), std::string("\0b", 2)) == false);
-	TEST_CHECK(cmp(std::string("\0a", 2), std::string("\0a", 2)));
-}
-
 TORRENT_TEST(string_ptr_zero_termination)
 {
 	char str[] = {'f', 'o', 'o', 'b', 'a', 'r'};
--- a/test/test_torrent_info.cpp
+++ b/test/test_torrent_info.cpp
@ -909,6 +909,25 @@ std::vector<lt::aux::vector<file_t, lt::file_index_t>> const test_cases
 		{"test/B.exe", 0x4000, {}, "test/B.2.exe"},
 		{"test/filler", 0x4000, {}, "test/filler"},
 	},
+	{
+		{"test/a/b/c/d/e/f/g/h/i/j/k/l/m", 0x4000, {}, "test/a/b/c/d/e/f/g/h/i/j/k/l/m"},
+		{"test/a", 0x4000, {}, "test/a.1"},
+		{"test/a/b", 0x4000, {}, "test/a/b.1"},
+		{"test/a/b/c", 0x4000, {}, "test/a/b/c.1"},
+		{"test/a/b/c/d", 0x4000, {}, "test/a/b/c/d.1"},
+		{"test/a/b/c/d/e", 0x4000, {}, "test/a/b/c/d/e.1"},
+		{"test/a/b/c/d/e/f", 0x4000, {}, "test/a/b/c/d/e/f.1"},
+		{"test/a/b/c/d/e/f/g", 0x4000, {}, "test/a/b/c/d/e/f/g.1"},
+		{"test/a/b/c/d/e/f/g/h", 0x4000, {}, "test/a/b/c/d/e/f/g/h.1"},
+		{"test/a/b/c/d/e/f/g/h/i", 0x4000, {}, "test/a/b/c/d/e/f/g/h/i.1"},
+		{"test/a/b/c/d/e/f/g/h/i/j", 0x4000, {}, "test/a/b/c/d/e/f/g/h/i/j.1"},
+	},
+	{
+		// it doesn't matter whether the file comes before the directory,
+		// directories take precedence
+		{"test/a", 0x4000, {}, "test/a.1"},
+		{"test/a/b", 0x4000, {}, "test/a/b"},
+	},
 	{
 		{"test/A/tmp", 0x4000, {}, "test/A/tmp"},
 		// a file may not have the same name as a directory