From 4441655bab09a9063942bc50f7fc3c3b424a68c0 Mon Sep 17 00:00:00 2001 From: arvidn Date: Fri, 5 Apr 2019 01:27:46 +0200 Subject: [PATCH] optimize resolve_duplicate_filenames_slow() --- ChangeLog | 1 + include/libtorrent/string_util.hpp | 9 --- src/string_util.cpp | 24 -------- src/torrent_info.cpp | 97 ++++++++++++++++++++++++------ test/test_string.cpp | 35 ----------- test/test_torrent_info.cpp | 19 ++++++ 6 files changed, 100 insertions(+), 85 deletions(-) diff --git a/ChangeLog b/ChangeLog index 4b8f08bad..210f288d5 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,4 @@ + * optimize resolving duplicate filenames in loading torrent files * fix python binding of dht_settings * tighten up various input validation checks * fix create_torrent python binding diff --git a/include/libtorrent/string_util.hpp b/include/libtorrent/string_util.hpp index 5efd8d7a2..0a9f69158 100644 --- a/include/libtorrent/string_util.hpp +++ b/include/libtorrent/string_util.hpp @@ -121,15 +121,6 @@ namespace libtorrent { TORRENT_EXTRA_EXPORT bool is_i2p_url(std::string const& url); #endif - - // this can be used as the hash function in std::unordered_* - struct TORRENT_EXTRA_EXPORT string_hash_no_case - { size_t operator()(std::string const& s) const; }; - - // these can be used as the comparison functions in std::map and std::set - struct TORRENT_EXTRA_EXPORT string_eq_no_case - { bool operator()(std::string const& lhs, std::string const& rhs) const; }; - } #endif diff --git a/src/string_util.cpp b/src/string_util.cpp index 8f4a242a6..25939e97c 100644 --- a/src/string_util.cpp +++ b/src/string_util.cpp @@ -379,28 +379,4 @@ namespace libtorrent { } #endif - - std::size_t string_hash_no_case::operator()(std::string const& s) const - { - std::size_t ret = 5381; - for (auto const c : s) - ret = (ret * 33) ^ static_cast(to_lower(c)); - return ret; - } - - bool string_eq_no_case::operator()(std::string const& lhs, std::string const& rhs) const - { - if (lhs.size() != rhs.size()) return false; - - auto s1 = lhs.cbegin(); - auto s2 = rhs.cbegin(); - - while (s1 != lhs.end() && s2 != rhs.end()) - { - if (to_lower(*s1) != to_lower(*s2)) return false; - ++s1; - ++s2; - } - return true; - } } diff --git a/src/torrent_info.cpp b/src/torrent_info.cpp index eb124a3b1..d8cd7db99 100644 --- a/src/torrent_info.cpp +++ b/src/torrent_info.cpp @@ -59,6 +59,10 @@ POSSIBILITY OF SUCH DAMAGE. #include "libtorrent/lazy_entry.hpp" #endif +#include "libtorrent/aux_/disable_warnings_push.hpp" +#include +#include "libtorrent/aux_/disable_warnings_pop.hpp" + #include #include #include @@ -640,58 +644,117 @@ namespace { } } +namespace { + + template + void process_string_lowercase(CRC& crc, string_view str) + { + for (char const c : str) + crc.process_byte(to_lower(c) & 0xff); + } + + struct name_entry + { + file_index_t idx; + int length; + }; +} + void torrent_info::resolve_duplicate_filenames_slow() { INVARIANT_CHECK; - std::unordered_map files; + // maps filename hash to file index + // or, if the file_index is negative, maps into the paths vector + std::unordered_multimap files; std::vector const& paths = m_files.paths(); files.reserve(paths.size() + aux::numeric_cast(m_files.num_files())); // insert all directories first, to make sure no files // are allowed to collied with them - for (auto const& i : paths) { - std::string p = combine_path(m_files.name(), i); - files.insert({p, file_index_t{-1}}); - while (has_parent_path(p)) + boost::crc_optimal<32, 0x1EDC6F41, 0xFFFFFFFF, 0xFFFFFFFF, true, true> crc; + if (!m_files.name().empty()) { - p = parent_path(std::move(p)); - // we don't want trailing slashes here - TORRENT_ASSERT(p[p.size() - 1] == TORRENT_SEPARATOR); - p.resize(p.size() - 1); - files.insert({p, file_index_t{-1}}); + process_string_lowercase(crc, m_files.name()); + } + file_index_t path_index{-1}; + for (auto const& path : paths) + { + auto local_crc = crc; + if (!path.empty()) local_crc.process_byte(TORRENT_SEPARATOR); + int count = 0; + for (char const c : path) + { + if (c == TORRENT_SEPARATOR) + files.insert({local_crc.checksum(), {path_index, count}}); + local_crc.process_byte(to_lower(c) & 0xff); + ++count; + } + files.insert({local_crc.checksum(), {path_index, int(path.size())}}); + --path_index; } } + // keep track of the total number of name collisions. If there are too + // many, it's probably a malicious torrent and we should just fail + int num_collisions = 0; for (auto const i : m_files.file_range()) { // as long as this file already exists // increase the counter - std::string filename = m_files.file_path(i); - auto const ret = files.insert({filename, i}); - if (ret.second) continue; + std::uint32_t const hash = m_files.file_path_hash(i, ""); + auto range = files.equal_range(hash); + auto const match = std::find_if(range.first, range.second, [&](std::pair const& o) + { + std::string const other_name = o.second.idx < file_index_t{} + ? combine_path(m_files.name(), paths[std::size_t(-static_cast(o.second.idx)-1)].substr(0, std::size_t(o.second.length))) + : m_files.file_path(o.second.idx); + return string_equal_no_case(other_name, m_files.file_path(i)); + }); + + if (match == range.second) + { + files.insert({hash, {i, 0}}); + continue; + } + // pad files are allowed to collide with each-other, as long as they have // the same size. - file_index_t const other_idx = ret.first->second; - if (other_idx != file_index_t{-1} + file_index_t const other_idx = match->second.idx; + if (other_idx >= file_index_t{} && (m_files.file_flags(i) & file_storage::flag_pad_file) && (m_files.file_flags(other_idx) & file_storage::flag_pad_file) && m_files.file_size(i) == m_files.file_size(other_idx)) continue; + std::string filename = m_files.file_path(i); std::string base = remove_extension(filename); std::string ext = extension(filename); int cnt = 0; - do + for (;;) { ++cnt; char new_ext[50]; std::snprintf(new_ext, sizeof(new_ext), ".%d%s", cnt, ext.c_str()); filename = base + new_ext; + + boost::crc_optimal<32, 0x1EDC6F41, 0xFFFFFFFF, 0xFFFFFFFF, true, true> crc; + process_string_lowercase(crc, filename); + std::uint32_t const new_hash = crc.checksum(); + if (files.find(new_hash) == files.end()) + { + files.insert({new_hash, {i, 0}}); + break; + } + ++num_collisions; + if (num_collisions > 100) + { + // TODO: this should be considered a failure, and the .torrent file + // rejected + } } - while (!files.insert({filename, i}).second); copy_on_write(); m_files.rename_file(i, filename); diff --git a/test/test_string.cpp b/test/test_string.cpp index e9796f191..1807ea3ce 100644 --- a/test/test_string.cpp +++ b/test/test_string.cpp @@ -455,41 +455,6 @@ TORRENT_TEST(i2p_url) } #endif -TORRENT_TEST(string_hash_no_case) -{ - string_hash_no_case hsh; - - // make sure different strings yield different hashes - TEST_CHECK(hsh("ab") != hsh("ba")); - - // make sure case is ignored - TEST_EQUAL(hsh("Ab"), hsh("ab")); - TEST_EQUAL(hsh("Ab"), hsh("aB")); - - // make sure zeroes in strings are supported - TEST_CHECK(hsh(std::string("\0a", 2)) != hsh(std::string("\0b", 2))); - TEST_EQUAL(hsh(std::string("\0a", 2)), hsh(std::string("\0a", 2))); -} - -TORRENT_TEST(string_eq_no_case) -{ - string_eq_no_case cmp; - TEST_CHECK(cmp("ab", "ba") == false); - TEST_CHECK(cmp("", "")); - TEST_CHECK(cmp("abc", "abc")); - - // make sure different lengths are correctly treated as different - TEST_CHECK(cmp("abc", "ab") == false); - - // make sure case is ignored - TEST_CHECK(cmp("Ab", "ab")); - TEST_CHECK(cmp("Ab", "aB")); - - // make sure zeros are supported - TEST_CHECK(cmp(std::string("\0a", 2), std::string("\0b", 2)) == false); - TEST_CHECK(cmp(std::string("\0a", 2), std::string("\0a", 2))); -} - TORRENT_TEST(string_ptr_zero_termination) { char str[] = {'f', 'o', 'o', 'b', 'a', 'r'}; diff --git a/test/test_torrent_info.cpp b/test/test_torrent_info.cpp index c0c6863b7..741fa33f5 100644 --- a/test/test_torrent_info.cpp +++ b/test/test_torrent_info.cpp @@ -909,6 +909,25 @@ std::vector> const test_cases {"test/B.exe", 0x4000, {}, "test/B.2.exe"}, {"test/filler", 0x4000, {}, "test/filler"}, }, + { + {"test/a/b/c/d/e/f/g/h/i/j/k/l/m", 0x4000, {}, "test/a/b/c/d/e/f/g/h/i/j/k/l/m"}, + {"test/a", 0x4000, {}, "test/a.1"}, + {"test/a/b", 0x4000, {}, "test/a/b.1"}, + {"test/a/b/c", 0x4000, {}, "test/a/b/c.1"}, + {"test/a/b/c/d", 0x4000, {}, "test/a/b/c/d.1"}, + {"test/a/b/c/d/e", 0x4000, {}, "test/a/b/c/d/e.1"}, + {"test/a/b/c/d/e/f", 0x4000, {}, "test/a/b/c/d/e/f.1"}, + {"test/a/b/c/d/e/f/g", 0x4000, {}, "test/a/b/c/d/e/f/g.1"}, + {"test/a/b/c/d/e/f/g/h", 0x4000, {}, "test/a/b/c/d/e/f/g/h.1"}, + {"test/a/b/c/d/e/f/g/h/i", 0x4000, {}, "test/a/b/c/d/e/f/g/h/i.1"}, + {"test/a/b/c/d/e/f/g/h/i/j", 0x4000, {}, "test/a/b/c/d/e/f/g/h/i/j.1"}, + }, + { + // it doesn't matter whether the file comes before the directory, + // directories take precedence + {"test/a", 0x4000, {}, "test/a.1"}, + {"test/a/b", 0x4000, {}, "test/a/b"}, + }, { {"test/A/tmp", 0x4000, {}, "test/A/tmp"}, // a file may not have the same name as a directory