optimize resolve_duplicate_filenames_slow()

This commit is contained in:
arvidn 2019-04-05 01:27:46 +02:00 committed by Arvid Norberg
parent 212ea3db27
commit 4441655bab
6 changed files with 100 additions and 85 deletions

View File

@ -1,3 +1,4 @@
* optimize resolving duplicate filenames in loading torrent files
* fix python binding of dht_settings
* tighten up various input validation checks
* fix create_torrent python binding

View File

@ -121,15 +121,6 @@ namespace libtorrent {
TORRENT_EXTRA_EXPORT bool is_i2p_url(std::string const& url);
#endif
// case-insensitive hash functor for std::string keys. It is suitable as the
// Hash template argument of the std::unordered_* containers
struct TORRENT_EXTRA_EXPORT string_hash_no_case
{
	size_t operator()(std::string const& s) const;
};
// case-insensitive equality predicate for std::string. It is suitable as the
// KeyEqual template argument of the std::unordered_* containers. Note that it
// tests for equality and does not define an ordering, so it cannot serve as
// the comparison function of std::map or std::set
struct TORRENT_EXTRA_EXPORT string_eq_no_case
{
	bool operator()(std::string const& lhs, std::string const& rhs) const;
};
}
#endif

View File

@ -379,28 +379,4 @@ namespace libtorrent {
}
#endif
// hashes ``s`` without regard to character case: every character is passed
// through to_lower() before being mixed in, so strings differing only in case
// produce the same value. The mixing step multiplies the running value by 33
// and xors in the next character, starting from 5381.
std::size_t string_hash_no_case::operator()(std::string const& s) const
{
	std::size_t hash = 5381;
	for (auto it = s.begin(); it != s.end(); ++it)
	{
		hash *= 33;
		hash ^= static_cast<std::size_t>(to_lower(*it));
	}
	return hash;
}
// returns true if ``lhs`` and ``rhs`` have the same length and every pair of
// corresponding characters is equal after being passed through to_lower().
// Strings of different length never compare equal.
bool string_eq_no_case::operator()(std::string const& lhs, std::string const& rhs) const
{
	std::size_t const len = lhs.size();
	if (len != rhs.size()) return false;

	// both strings are known to be ``len`` characters long at this point, so
	// a single index covers both
	for (std::size_t pos = 0; pos < len; ++pos)
	{
		if (to_lower(lhs[pos]) != to_lower(rhs[pos])) return false;
	}
	return true;
}
}

View File

@ -59,6 +59,10 @@ POSSIBILITY OF SUCH DAMAGE.
#include "libtorrent/lazy_entry.hpp"
#endif
#include "libtorrent/aux_/disable_warnings_push.hpp"
#include <boost/crc.hpp>
#include "libtorrent/aux_/disable_warnings_pop.hpp"
#include <unordered_map>
#include <unordered_set>
#include <cstdint>
@ -640,58 +644,117 @@ namespace {
}
}
namespace {
template <class CRC>
void process_string_lowercase(CRC& crc, string_view str)
{
for (char const c : str)
crc.process_byte(to_lower(c) & 0xff);
}
// the value type of the hash -> name table built by
// resolve_duplicate_filenames_slow(). An entry identifies the name that
// produced a given hash, so that a hash match can be confirmed by comparing
// the actual strings. The members are initialized positionally, so their
// order matters.
struct name_entry
{
// a non-negative value is the index of a file. A negative value refers to an
// entry in the paths vector instead: -1 is paths[0], -2 is paths[1] and so on
file_index_t idx;
// for entries referring to the paths vector, the number of leading
// characters of that path that make up this name. Entries referring to
// files are inserted with 0 here
int length;
};
}
void torrent_info::resolve_duplicate_filenames_slow()
{
INVARIANT_CHECK;
std::unordered_map<std::string, file_index_t, string_hash_no_case, string_eq_no_case> files;
// maps filename hash to file index
// or, if the file_index is negative, maps into the paths vector
std::unordered_multimap<std::uint32_t, name_entry> files;
std::vector<std::string> const& paths = m_files.paths();
files.reserve(paths.size() + aux::numeric_cast<std::size_t>(m_files.num_files()));
// insert all directories first, to make sure no files
// are allowed to collide with them
for (auto const& i : paths)
{
std::string p = combine_path(m_files.name(), i);
files.insert({p, file_index_t{-1}});
while (has_parent_path(p))
boost::crc_optimal<32, 0x1EDC6F41, 0xFFFFFFFF, 0xFFFFFFFF, true, true> crc;
if (!m_files.name().empty())
{
p = parent_path(std::move(p));
// we don't want trailing slashes here
TORRENT_ASSERT(p[p.size() - 1] == TORRENT_SEPARATOR);
p.resize(p.size() - 1);
files.insert({p, file_index_t{-1}});
process_string_lowercase(crc, m_files.name());
}
file_index_t path_index{-1};
for (auto const& path : paths)
{
auto local_crc = crc;
if (!path.empty()) local_crc.process_byte(TORRENT_SEPARATOR);
int count = 0;
for (char const c : path)
{
if (c == TORRENT_SEPARATOR)
files.insert({local_crc.checksum(), {path_index, count}});
local_crc.process_byte(to_lower(c) & 0xff);
++count;
}
files.insert({local_crc.checksum(), {path_index, int(path.size())}});
--path_index;
}
}
// keep track of the total number of name collisions. If there are too
// many, it's probably a malicious torrent and we should just fail
int num_collisions = 0;
for (auto const i : m_files.file_range())
{
// as long as this file already exists
// increase the counter
std::string filename = m_files.file_path(i);
auto const ret = files.insert({filename, i});
if (ret.second) continue;
std::uint32_t const hash = m_files.file_path_hash(i, "");
auto range = files.equal_range(hash);
auto const match = std::find_if(range.first, range.second, [&](std::pair<std::uint32_t, name_entry> const& o)
{
std::string const other_name = o.second.idx < file_index_t{}
? combine_path(m_files.name(), paths[std::size_t(-static_cast<int>(o.second.idx)-1)].substr(0, std::size_t(o.second.length)))
: m_files.file_path(o.second.idx);
return string_equal_no_case(other_name, m_files.file_path(i));
});
if (match == range.second)
{
files.insert({hash, {i, 0}});
continue;
}
// pad files are allowed to collide with each other, as long as they have
// the same size.
file_index_t const other_idx = ret.first->second;
if (other_idx != file_index_t{-1}
file_index_t const other_idx = match->second.idx;
if (other_idx >= file_index_t{}
&& (m_files.file_flags(i) & file_storage::flag_pad_file)
&& (m_files.file_flags(other_idx) & file_storage::flag_pad_file)
&& m_files.file_size(i) == m_files.file_size(other_idx))
continue;
std::string filename = m_files.file_path(i);
std::string base = remove_extension(filename);
std::string ext = extension(filename);
int cnt = 0;
do
for (;;)
{
++cnt;
char new_ext[50];
std::snprintf(new_ext, sizeof(new_ext), ".%d%s", cnt, ext.c_str());
filename = base + new_ext;
boost::crc_optimal<32, 0x1EDC6F41, 0xFFFFFFFF, 0xFFFFFFFF, true, true> crc;
process_string_lowercase(crc, filename);
std::uint32_t const new_hash = crc.checksum();
if (files.find(new_hash) == files.end())
{
files.insert({new_hash, {i, 0}});
break;
}
++num_collisions;
if (num_collisions > 100)
{
// TODO: this should be considered a failure, and the .torrent file
// rejected
}
}
while (!files.insert({filename, i}).second);
copy_on_write();
m_files.rename_file(i, filename);

View File

@ -455,41 +455,6 @@ TORRENT_TEST(i2p_url)
}
#endif
// exercises string_hash_no_case: the hash depends on character order, ignores
// character case and takes bytes following an embedded zero into account
TORRENT_TEST(string_hash_no_case)
{
string_hash_no_case hsh;
// make sure different strings yield different hashes
TEST_CHECK(hsh("ab") != hsh("ba"));
// make sure case is ignored
TEST_EQUAL(hsh("Ab"), hsh("ab"));
TEST_EQUAL(hsh("Ab"), hsh("aB"));
// make sure zeroes in strings are supported
// (the explicit length of 2 keeps the leading '\0' as part of the
// std::string, rather than having it terminate the literal)
TEST_CHECK(hsh(std::string("\0a", 2)) != hsh(std::string("\0b", 2)));
TEST_EQUAL(hsh(std::string("\0a", 2)), hsh(std::string("\0a", 2)));
}
// exercises string_eq_no_case: equal strings (including two empty ones)
// compare equal, differing content or length compares unequal, character
// case is ignored and bytes following an embedded zero are compared too
TORRENT_TEST(string_eq_no_case)
{
string_eq_no_case cmp;
TEST_CHECK(cmp("ab", "ba") == false);
TEST_CHECK(cmp("", ""));
TEST_CHECK(cmp("abc", "abc"));
// make sure different lengths are correctly treated as different
TEST_CHECK(cmp("abc", "ab") == false);
// make sure case is ignored
TEST_CHECK(cmp("Ab", "ab"));
TEST_CHECK(cmp("Ab", "aB"));
// make sure zeros are supported
// (the explicit length of 2 keeps the leading '\0' as part of the
// std::string, rather than having it terminate the literal)
TEST_CHECK(cmp(std::string("\0a", 2), std::string("\0b", 2)) == false);
TEST_CHECK(cmp(std::string("\0a", 2), std::string("\0a", 2)));
}
TORRENT_TEST(string_ptr_zero_termination)
{
char str[] = {'f', 'o', 'o', 'b', 'a', 'r'};

View File

@ -909,6 +909,25 @@ std::vector<lt::aux::vector<file_t, lt::file_index_t>> const test_cases
{"test/B.exe", 0x4000, {}, "test/B.2.exe"},
{"test/filler", 0x4000, {}, "test/filler"},
},
{
{"test/a/b/c/d/e/f/g/h/i/j/k/l/m", 0x4000, {}, "test/a/b/c/d/e/f/g/h/i/j/k/l/m"},
{"test/a", 0x4000, {}, "test/a.1"},
{"test/a/b", 0x4000, {}, "test/a/b.1"},
{"test/a/b/c", 0x4000, {}, "test/a/b/c.1"},
{"test/a/b/c/d", 0x4000, {}, "test/a/b/c/d.1"},
{"test/a/b/c/d/e", 0x4000, {}, "test/a/b/c/d/e.1"},
{"test/a/b/c/d/e/f", 0x4000, {}, "test/a/b/c/d/e/f.1"},
{"test/a/b/c/d/e/f/g", 0x4000, {}, "test/a/b/c/d/e/f/g.1"},
{"test/a/b/c/d/e/f/g/h", 0x4000, {}, "test/a/b/c/d/e/f/g/h.1"},
{"test/a/b/c/d/e/f/g/h/i", 0x4000, {}, "test/a/b/c/d/e/f/g/h/i.1"},
{"test/a/b/c/d/e/f/g/h/i/j", 0x4000, {}, "test/a/b/c/d/e/f/g/h/i/j.1"},
},
{
// it doesn't matter whether the file comes before the directory,
// directories take precedence
{"test/a", 0x4000, {}, "test/a.1"},
{"test/a/b", 0x4000, {}, "test/a/b"},
},
{
{"test/A/tmp", 0x4000, {}, "test/A/tmp"},
// a file may not have the same name as a directory