forked from premiere/premiere-libtorrent
optimize resolve_duplicate_filenames_slow()
This commit is contained in:
parent
212ea3db27
commit
4441655bab
|
@ -1,3 +1,4 @@
|
|||
* optimize resolving duplicate filenames in loading torrent files
|
||||
* fix python binding of dht_settings
|
||||
* tighten up various input validation checks
|
||||
* fix create_torrent python binding
|
||||
|
|
|
@ -121,15 +121,6 @@ namespace libtorrent {
|
|||
TORRENT_EXTRA_EXPORT bool is_i2p_url(std::string const& url);
|
||||
|
||||
#endif
|
||||
|
||||
// this can be used as the hash function in std::unordered_*
|
||||
struct TORRENT_EXTRA_EXPORT string_hash_no_case
|
||||
{ size_t operator()(std::string const& s) const; };
|
||||
|
||||
// these can be used as the comparison functions in std::map and std::set
|
||||
struct TORRENT_EXTRA_EXPORT string_eq_no_case
|
||||
{ bool operator()(std::string const& lhs, std::string const& rhs) const; };
|
||||
|
||||
}
|
||||
|
||||
#endif
|
||||
|
|
|
@ -379,28 +379,4 @@ namespace libtorrent {
|
|||
}
|
||||
|
||||
#endif
|
||||
|
||||
std::size_t string_hash_no_case::operator()(std::string const& s) const
|
||||
{
|
||||
std::size_t ret = 5381;
|
||||
for (auto const c : s)
|
||||
ret = (ret * 33) ^ static_cast<std::size_t>(to_lower(c));
|
||||
return ret;
|
||||
}
|
||||
|
||||
bool string_eq_no_case::operator()(std::string const& lhs, std::string const& rhs) const
|
||||
{
|
||||
if (lhs.size() != rhs.size()) return false;
|
||||
|
||||
auto s1 = lhs.cbegin();
|
||||
auto s2 = rhs.cbegin();
|
||||
|
||||
while (s1 != lhs.end() && s2 != rhs.end())
|
||||
{
|
||||
if (to_lower(*s1) != to_lower(*s2)) return false;
|
||||
++s1;
|
||||
++s2;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -59,6 +59,10 @@ POSSIBILITY OF SUCH DAMAGE.
|
|||
#include "libtorrent/lazy_entry.hpp"
|
||||
#endif
|
||||
|
||||
#include "libtorrent/aux_/disable_warnings_push.hpp"
|
||||
#include <boost/crc.hpp>
|
||||
#include "libtorrent/aux_/disable_warnings_pop.hpp"
|
||||
|
||||
#include <unordered_map>
|
||||
#include <unordered_set>
|
||||
#include <cstdint>
|
||||
|
@ -640,58 +644,117 @@ namespace {
|
|||
}
|
||||
}
|
||||
|
||||
namespace {
|
||||
|
||||
template <class CRC>
|
||||
void process_string_lowercase(CRC& crc, string_view str)
|
||||
{
|
||||
for (char const c : str)
|
||||
crc.process_byte(to_lower(c) & 0xff);
|
||||
}
|
||||
|
||||
struct name_entry
|
||||
{
|
||||
file_index_t idx;
|
||||
int length;
|
||||
};
|
||||
}
|
||||
|
||||
void torrent_info::resolve_duplicate_filenames_slow()
|
||||
{
|
||||
INVARIANT_CHECK;
|
||||
|
||||
std::unordered_map<std::string, file_index_t, string_hash_no_case, string_eq_no_case> files;
|
||||
// maps filename hash to file index
|
||||
// or, if the file_index is negative, maps into the paths vector
|
||||
std::unordered_multimap<std::uint32_t, name_entry> files;
|
||||
|
||||
std::vector<std::string> const& paths = m_files.paths();
|
||||
files.reserve(paths.size() + aux::numeric_cast<std::size_t>(m_files.num_files()));
|
||||
|
||||
// insert all directories first, to make sure no files
|
||||
// are allowed to collied with them
|
||||
for (auto const& i : paths)
|
||||
{
|
||||
std::string p = combine_path(m_files.name(), i);
|
||||
files.insert({p, file_index_t{-1}});
|
||||
while (has_parent_path(p))
|
||||
boost::crc_optimal<32, 0x1EDC6F41, 0xFFFFFFFF, 0xFFFFFFFF, true, true> crc;
|
||||
if (!m_files.name().empty())
|
||||
{
|
||||
p = parent_path(std::move(p));
|
||||
// we don't want trailing slashes here
|
||||
TORRENT_ASSERT(p[p.size() - 1] == TORRENT_SEPARATOR);
|
||||
p.resize(p.size() - 1);
|
||||
files.insert({p, file_index_t{-1}});
|
||||
process_string_lowercase(crc, m_files.name());
|
||||
}
|
||||
file_index_t path_index{-1};
|
||||
for (auto const& path : paths)
|
||||
{
|
||||
auto local_crc = crc;
|
||||
if (!path.empty()) local_crc.process_byte(TORRENT_SEPARATOR);
|
||||
int count = 0;
|
||||
for (char const c : path)
|
||||
{
|
||||
if (c == TORRENT_SEPARATOR)
|
||||
files.insert({local_crc.checksum(), {path_index, count}});
|
||||
local_crc.process_byte(to_lower(c) & 0xff);
|
||||
++count;
|
||||
}
|
||||
files.insert({local_crc.checksum(), {path_index, int(path.size())}});
|
||||
--path_index;
|
||||
}
|
||||
}
|
||||
|
||||
// keep track of the total number of name collisions. If there are too
|
||||
// many, it's probably a malicious torrent and we should just fail
|
||||
int num_collisions = 0;
|
||||
for (auto const i : m_files.file_range())
|
||||
{
|
||||
// as long as this file already exists
|
||||
// increase the counter
|
||||
std::string filename = m_files.file_path(i);
|
||||
auto const ret = files.insert({filename, i});
|
||||
if (ret.second) continue;
|
||||
std::uint32_t const hash = m_files.file_path_hash(i, "");
|
||||
auto range = files.equal_range(hash);
|
||||
auto const match = std::find_if(range.first, range.second, [&](std::pair<std::uint32_t, name_entry> const& o)
|
||||
{
|
||||
std::string const other_name = o.second.idx < file_index_t{}
|
||||
? combine_path(m_files.name(), paths[std::size_t(-static_cast<int>(o.second.idx)-1)].substr(0, std::size_t(o.second.length)))
|
||||
: m_files.file_path(o.second.idx);
|
||||
return string_equal_no_case(other_name, m_files.file_path(i));
|
||||
});
|
||||
|
||||
if (match == range.second)
|
||||
{
|
||||
files.insert({hash, {i, 0}});
|
||||
continue;
|
||||
}
|
||||
|
||||
// pad files are allowed to collide with each-other, as long as they have
|
||||
// the same size.
|
||||
file_index_t const other_idx = ret.first->second;
|
||||
if (other_idx != file_index_t{-1}
|
||||
file_index_t const other_idx = match->second.idx;
|
||||
if (other_idx >= file_index_t{}
|
||||
&& (m_files.file_flags(i) & file_storage::flag_pad_file)
|
||||
&& (m_files.file_flags(other_idx) & file_storage::flag_pad_file)
|
||||
&& m_files.file_size(i) == m_files.file_size(other_idx))
|
||||
continue;
|
||||
|
||||
std::string filename = m_files.file_path(i);
|
||||
std::string base = remove_extension(filename);
|
||||
std::string ext = extension(filename);
|
||||
int cnt = 0;
|
||||
do
|
||||
for (;;)
|
||||
{
|
||||
++cnt;
|
||||
char new_ext[50];
|
||||
std::snprintf(new_ext, sizeof(new_ext), ".%d%s", cnt, ext.c_str());
|
||||
filename = base + new_ext;
|
||||
|
||||
boost::crc_optimal<32, 0x1EDC6F41, 0xFFFFFFFF, 0xFFFFFFFF, true, true> crc;
|
||||
process_string_lowercase(crc, filename);
|
||||
std::uint32_t const new_hash = crc.checksum();
|
||||
if (files.find(new_hash) == files.end())
|
||||
{
|
||||
files.insert({new_hash, {i, 0}});
|
||||
break;
|
||||
}
|
||||
++num_collisions;
|
||||
if (num_collisions > 100)
|
||||
{
|
||||
// TODO: this should be considered a failure, and the .torrent file
|
||||
// rejected
|
||||
}
|
||||
}
|
||||
while (!files.insert({filename, i}).second);
|
||||
|
||||
copy_on_write();
|
||||
m_files.rename_file(i, filename);
|
||||
|
|
|
@ -455,41 +455,6 @@ TORRENT_TEST(i2p_url)
|
|||
}
|
||||
#endif
|
||||
|
||||
TORRENT_TEST(string_hash_no_case)
|
||||
{
|
||||
string_hash_no_case hsh;
|
||||
|
||||
// make sure different strings yield different hashes
|
||||
TEST_CHECK(hsh("ab") != hsh("ba"));
|
||||
|
||||
// make sure case is ignored
|
||||
TEST_EQUAL(hsh("Ab"), hsh("ab"));
|
||||
TEST_EQUAL(hsh("Ab"), hsh("aB"));
|
||||
|
||||
// make sure zeroes in strings are supported
|
||||
TEST_CHECK(hsh(std::string("\0a", 2)) != hsh(std::string("\0b", 2)));
|
||||
TEST_EQUAL(hsh(std::string("\0a", 2)), hsh(std::string("\0a", 2)));
|
||||
}
|
||||
|
||||
TORRENT_TEST(string_eq_no_case)
|
||||
{
|
||||
string_eq_no_case cmp;
|
||||
TEST_CHECK(cmp("ab", "ba") == false);
|
||||
TEST_CHECK(cmp("", ""));
|
||||
TEST_CHECK(cmp("abc", "abc"));
|
||||
|
||||
// make sure different lengths are correctly treated as different
|
||||
TEST_CHECK(cmp("abc", "ab") == false);
|
||||
|
||||
// make sure case is ignored
|
||||
TEST_CHECK(cmp("Ab", "ab"));
|
||||
TEST_CHECK(cmp("Ab", "aB"));
|
||||
|
||||
// make sure zeros are supported
|
||||
TEST_CHECK(cmp(std::string("\0a", 2), std::string("\0b", 2)) == false);
|
||||
TEST_CHECK(cmp(std::string("\0a", 2), std::string("\0a", 2)));
|
||||
}
|
||||
|
||||
TORRENT_TEST(string_ptr_zero_termination)
|
||||
{
|
||||
char str[] = {'f', 'o', 'o', 'b', 'a', 'r'};
|
||||
|
|
|
@ -909,6 +909,25 @@ std::vector<lt::aux::vector<file_t, lt::file_index_t>> const test_cases
|
|||
{"test/B.exe", 0x4000, {}, "test/B.2.exe"},
|
||||
{"test/filler", 0x4000, {}, "test/filler"},
|
||||
},
|
||||
{
|
||||
{"test/a/b/c/d/e/f/g/h/i/j/k/l/m", 0x4000, {}, "test/a/b/c/d/e/f/g/h/i/j/k/l/m"},
|
||||
{"test/a", 0x4000, {}, "test/a.1"},
|
||||
{"test/a/b", 0x4000, {}, "test/a/b.1"},
|
||||
{"test/a/b/c", 0x4000, {}, "test/a/b/c.1"},
|
||||
{"test/a/b/c/d", 0x4000, {}, "test/a/b/c/d.1"},
|
||||
{"test/a/b/c/d/e", 0x4000, {}, "test/a/b/c/d/e.1"},
|
||||
{"test/a/b/c/d/e/f", 0x4000, {}, "test/a/b/c/d/e/f.1"},
|
||||
{"test/a/b/c/d/e/f/g", 0x4000, {}, "test/a/b/c/d/e/f/g.1"},
|
||||
{"test/a/b/c/d/e/f/g/h", 0x4000, {}, "test/a/b/c/d/e/f/g/h.1"},
|
||||
{"test/a/b/c/d/e/f/g/h/i", 0x4000, {}, "test/a/b/c/d/e/f/g/h/i.1"},
|
||||
{"test/a/b/c/d/e/f/g/h/i/j", 0x4000, {}, "test/a/b/c/d/e/f/g/h/i/j.1"},
|
||||
},
|
||||
{
|
||||
// it doesn't matter whether the file comes before the directory,
|
||||
// directories take precedence
|
||||
{"test/a", 0x4000, {}, "test/a.1"},
|
||||
{"test/a/b", 0x4000, {}, "test/a/b"},
|
||||
},
|
||||
{
|
||||
{"test/A/tmp", 0x4000, {}, "test/A/tmp"},
|
||||
// a file may not have the same name as a directory
|
||||
|
|
Loading…
Reference in New Issue