forked from premiere/premiere-libtorrent
Optimized loading and parsing of torrents, specifically the detection of duplicate filenames (optimizing for the common case of no collisions). Expanded the unit test to uncover an unsupported case (not fixed yet). Cut down on memory allocation and copying when parsing torrent files in memory.
This commit is contained in:
parent
3d47a1fb81
commit
d49e97afb3
|
@ -136,6 +136,8 @@ namespace libtorrent
|
|||
|
||||
void set_name(char const* n, bool borrow_string = false, int string_len = 0);
|
||||
std::string filename() const;
|
||||
char const* filename_ptr() const { return name; }
|
||||
int filename_len() const { return name_len == name_is_owned?strlen(name):name_len; }
|
||||
|
||||
enum {
|
||||
name_is_owned = (1<<12)-1,
|
||||
|
@ -457,6 +459,13 @@ namespace libtorrent
|
|||
bool pad_file_at(int index) const;
|
||||
boost::int64_t file_offset(int index) const;
|
||||
|
||||
// returns the crc32 hash of file_path(index)
|
||||
boost::uint32_t file_path_hash(int index, std::string const& save_path) const;
|
||||
|
||||
// returns the crc32 hash of the path at index. Note, this index does not
|
||||
// refer to a file, but to a path in the vector returned by paths().
|
||||
boost::uint32_t path_hash(int index, std::string const& save_path) const;
|
||||
|
||||
// flags indicating various attributes for files in
|
||||
// a file_storage.
|
||||
enum file_flags_t
|
||||
|
@ -493,6 +502,7 @@ namespace libtorrent
|
|||
// filesystem by making them all point to the same filename, but with
|
||||
// different file bases, so that they don't overlap.
|
||||
// torrent_info::remap_files() can be used to use a new file layout.
|
||||
// TODO: 3 deprecate the file_base feature
|
||||
boost::int64_t file_base(int index) const;
|
||||
void set_file_base(int index, boost::int64_t off);
|
||||
|
||||
|
@ -544,7 +554,8 @@ namespace libtorrent
|
|||
// the number of pieces in the torrent
|
||||
int m_num_pieces;
|
||||
|
||||
void update_path_index(internal_file_entry& e);
|
||||
void update_path_index(internal_file_entry& e, std::string const& path
|
||||
, bool set_name = true);
|
||||
void reorder_file(int index, int dst);
|
||||
|
||||
// the list of files that this torrent consists of
|
||||
|
|
|
@ -677,6 +677,9 @@ namespace libtorrent
|
|||
|
||||
void resolve_duplicate_filenames();
|
||||
|
||||
// the slow path, in case we detect/suspect a name collision
|
||||
void resolve_duplicate_filenames_slow();
|
||||
|
||||
#if TORRENT_USE_INVARIANT_CHECKS
|
||||
friend class invariant_access;
|
||||
void check_invariant() const;
|
||||
|
|
|
@ -35,9 +35,16 @@ POSSIBILITY OF SUCH DAMAGE.
|
|||
#include "libtorrent/file.hpp"
|
||||
#include "libtorrent/utf8.hpp"
|
||||
#include <boost/bind.hpp>
|
||||
#include <boost/crc.hpp>
|
||||
#include <cstdio>
|
||||
#include <algorithm>
|
||||
|
||||
#if defined(TORRENT_WINDOWS) || defined(TORRENT_OS2)
|
||||
#define TORRENT_SEPARATOR '\\'
|
||||
#else
|
||||
#define TORRENT_SEPARATOR '/'
|
||||
#endif
|
||||
|
||||
namespace libtorrent
|
||||
{
|
||||
file_storage::file_storage()
|
||||
|
@ -70,43 +77,56 @@ namespace libtorrent
|
|||
return piece_length();
|
||||
}
|
||||
|
||||
void file_storage::update_path_index(internal_file_entry& e)
|
||||
namespace
|
||||
{
|
||||
std::string fn = e.filename();
|
||||
if (is_complete(fn))
|
||||
// Returns true when the byte range [str, str + len) is exactly equal to the
// contents of str2. str does not need to be null-terminated; only its first
// len bytes are examined. A negative len never matches.
bool compare_string(char const* str, int len, std::string const& str2)
{
	// reject negative lengths explicitly rather than relying on the implicit
	// signed-to-unsigned conversion in the size comparison below
	if (len < 0) return false;

	std::size_t const n = static_cast<std::size_t>(len);
	if (str2.size() != n) return false;
	return std::memcmp(str2.data(), str, n) == 0;
}
|
||||
}
|
||||
|
||||
// path is not supposed to include the name of the torrent itself.
|
||||
void file_storage::update_path_index(internal_file_entry& e
|
||||
, std::string const& path, bool set_name)
|
||||
{
|
||||
if (is_complete(path))
|
||||
{
|
||||
TORRENT_ASSERT(set_name);
|
||||
e.set_name(path.c_str());
|
||||
e.path_index = -2;
|
||||
return;
|
||||
}
|
||||
|
||||
TORRENT_ASSERT(path[0] != '/');
|
||||
|
||||
// sorry about this messy string handling, but I did
|
||||
// profile it, and it was expensive
|
||||
char const* leaf = filename_cstr(fn.c_str());
|
||||
char const* leaf = filename_cstr(path.c_str());
|
||||
char const* branch_path = "";
|
||||
if (leaf > fn.c_str())
|
||||
int branch_len = 0;
|
||||
if (leaf > path.c_str())
|
||||
{
|
||||
// split the string into the leaf filename
|
||||
// and the branch path
|
||||
branch_path = fn.c_str();
|
||||
((char*)leaf)[-1] = 0;
|
||||
branch_path = path.c_str();
|
||||
branch_len = leaf - path.c_str();
|
||||
}
|
||||
if (branch_path[0] == 0)
|
||||
if (branch_len <= 0)
|
||||
{
|
||||
if (set_name) e.set_name(leaf);
|
||||
e.path_index = -1;
|
||||
return;
|
||||
}
|
||||
|
||||
int branch_len = strlen(branch_path);
|
||||
if (branch_len >= m_name.size()
|
||||
&& std::memcmp(branch_path, m_name.c_str(), m_name.size()) == 0
|
||||
&& (branch_len == m_name.size()
|
||||
#ifdef TORRENT_WINDOWS
|
||||
|| branch_path[m_name.size()] == '\\'
|
||||
#endif
|
||||
|| branch_path[m_name.size()] == '/'
|
||||
))
|
||||
&& std::memcmp(branch_path, m_name.c_str(), m_name.size()) == 0)
|
||||
{
|
||||
branch_path += m_name.size()
|
||||
// the +1 is to skip the trailing '/' (or '\')
|
||||
int offset = m_name.size()
|
||||
+ (m_name.size() == branch_len?0:1);
|
||||
branch_path += offset;
|
||||
branch_len -= offset;
|
||||
e.no_root_dir = false;
|
||||
}
|
||||
else
|
||||
|
@ -116,20 +136,29 @@ namespace libtorrent
|
|||
|
||||
// do we already have this path in the path list?
|
||||
std::vector<std::string>::reverse_iterator p
|
||||
= std::find(m_paths.rbegin(), m_paths.rend(), branch_path);
|
||||
= std::find_if(m_paths.rbegin(), m_paths.rend()
|
||||
, boost::bind(&compare_string, branch_path, branch_len, _1));
|
||||
|
||||
if (p == m_paths.rend())
|
||||
{
|
||||
// no, we don't. add it
|
||||
e.path_index = m_paths.size();
|
||||
m_paths.push_back(branch_path);
|
||||
TORRENT_ASSERT(branch_path[0] != '/');
|
||||
|
||||
// trim trailing slashes
|
||||
if (branch_len > 0 && branch_path[branch_len-1] == TORRENT_SEPARATOR)
|
||||
--branch_len;
|
||||
|
||||
// poor man's emplace back
|
||||
m_paths.resize(m_paths.size() + 1);
|
||||
m_paths.back().assign(branch_path, branch_len);
|
||||
}
|
||||
else
|
||||
{
|
||||
// yes we do. use it
|
||||
e.path_index = p.base() - m_paths.begin() - 1;
|
||||
}
|
||||
e.set_name(leaf);
|
||||
if (set_name) e.set_name(leaf);
|
||||
}
|
||||
|
||||
#ifndef TORRENT_NO_DEPRECATE
|
||||
|
@ -260,8 +289,7 @@ namespace libtorrent
|
|||
TORRENT_ASSERT_PRECOND(index >= 0 && index < int(m_files.size()));
|
||||
std::string utf8;
|
||||
wchar_utf8(new_filename, utf8);
|
||||
m_files[index].set_name(utf8.c_str());
|
||||
update_path_index(m_files[index]);
|
||||
update_path_index(m_files[index], utf8);
|
||||
}
|
||||
|
||||
void file_storage::add_file(std::wstring const& file, boost::int64_t file_size
|
||||
|
@ -282,8 +310,7 @@ namespace libtorrent
|
|||
void file_storage::rename_file(int index, std::string const& new_filename)
|
||||
{
|
||||
TORRENT_ASSERT_PRECOND(index >= 0 && index < int(m_files.size()));
|
||||
m_files[index].set_name(new_filename.c_str());
|
||||
update_path_index(m_files[index]);
|
||||
update_path_index(m_files[index], new_filename);
|
||||
}
|
||||
|
||||
namespace
|
||||
|
@ -468,15 +495,17 @@ namespace libtorrent
|
|||
if (m_files.empty())
|
||||
m_name = split_path(path).c_str();
|
||||
}
|
||||
internal_file_entry ife;
|
||||
m_files.push_back(ife);
|
||||
|
||||
// this is poor-man's emplace_back()
|
||||
m_files.resize(m_files.size() + 1);
|
||||
internal_file_entry& e = m_files.back();
|
||||
|
||||
// first set the filename to the full path so update_path_index()
|
||||
// can do its thing, then rename it to point to the borrowed filename
|
||||
// pointer
|
||||
e.set_name(path.c_str());
|
||||
update_path_index(e);
|
||||
// the last argument specifies whether the function should also set
|
||||
// the filename. If it does, it will copy the leaf filename from path.
|
||||
// if filename is NULL, we should copy it. If it isn't, we're borrowing
|
||||
// it and we can save the copy by setting it after this call to
|
||||
// update_path_index().
|
||||
update_path_index(e, path, filename == NULL);
|
||||
|
||||
// filename is allowed to be NULL, in which case we just use path
|
||||
if (filename)
|
||||
|
@ -547,6 +576,104 @@ namespace libtorrent
|
|||
return m_file_base[index];
|
||||
}
|
||||
|
||||
namespace
|
||||
{
|
||||
template <class CRC>
|
||||
void process_string_lowercase(CRC& crc, char const* str, int len)
|
||||
{
|
||||
for (int i = 0; i < len; ++i, ++str)
|
||||
crc.process_byte(to_lower(*str));
|
||||
}
|
||||
}
|
||||
|
||||
// Returns the CRC (polynomial 0x1EDC6F41) of the lowercased string
// "<save_path><sep><torrent name><sep><m_paths[index]>". The
// "<save_path><sep>" prefix is left out when save_path is empty. Note that
// index refers to an entry in m_paths, not to a file.
boost::uint32_t file_storage::path_hash(int index
	, std::string const& save_path) const
{
	TORRENT_ASSERT_PRECOND(index >= 0 && index < int(m_paths.size()));

	typedef boost::crc_optimal<32, 0x1EDC6F41, 0xFFFFFFFF, 0xFFFFFFFF, true, true> crc_type;
	crc_type hasher;

	bool const has_save_path = !save_path.empty();
	if (has_save_path)
	{
		// the separator is appended here, so save_path must not end in one
		TORRENT_ASSERT(save_path[save_path.size()-1] != TORRENT_SEPARATOR);
		process_string_lowercase(hasher, save_path.c_str(), save_path.size());
		hasher.process_byte(TORRENT_SEPARATOR);
	}

	// the torrent name is always part of the hashed path
	process_string_lowercase(hasher, m_name.c_str(), m_name.size());
	hasher.process_byte(TORRENT_SEPARATOR);

	std::string const& dir = m_paths[index];
	process_string_lowercase(hasher, dir.c_str(), dir.size());

	return hasher.checksum();
}
|
||||
|
||||
// Returns the CRC (polynomial 0x1EDC6F41) of the lowercased path of the file
// at the given index. What is hashed depends on the entry's path_index:
//   -2: only the filename (an absolute path), save_path is ignored
//   -1: "<save_path><sep><filename>"
//   otherwise: "<save_path><sep>[<torrent name><sep>][<dir><sep>]<filename>",
//     where the torrent name is left out when no_root_dir is set and <dir>
//     (m_paths[path_index]) is left out when it is empty.
// The "<save_path><sep>" prefix is left out when save_path is empty.
boost::uint32_t file_storage::file_path_hash(int index
	, std::string const& save_path) const
{
	TORRENT_ASSERT_PRECOND(index >= 0 && index < int(m_files.size()));
	internal_file_entry const& entry = m_files[index];

	boost::crc_optimal<32, 0x1EDC6F41, 0xFFFFFFFF, 0xFFFFFFFF, true, true> hasher;

	// -2 means this is an absolute path filename; it is hashed on its own
	if (entry.path_index == -2)
	{
		process_string_lowercase(hasher, entry.filename_ptr(), entry.filename_len());
		return hasher.checksum();
	}

	// every remaining case starts with the save path (when there is one)
	if (!save_path.empty())
	{
		process_string_lowercase(hasher, save_path.c_str(), save_path.size());
		TORRENT_ASSERT(save_path[save_path.size()-1] != TORRENT_SEPARATOR);
		hasher.process_byte(TORRENT_SEPARATOR);
	}

	// -1 means no path, in which case only the filename follows
	if (entry.path_index != -1)
	{
		if (!entry.no_root_dir)
		{
			// the file lives under the torrent's root directory
			process_string_lowercase(hasher, m_name.c_str(), m_name.size());
			TORRENT_ASSERT(m_name.size() > 0);
			TORRENT_ASSERT(m_name[m_name.size()-1] != TORRENT_SEPARATOR);
			hasher.process_byte(TORRENT_SEPARATOR);
		}

		std::string const& dir = m_paths[entry.path_index];
		if (!dir.empty())
		{
			process_string_lowercase(hasher, dir.c_str(), dir.size());
			TORRENT_ASSERT(dir[dir.size()-1] != TORRENT_SEPARATOR);
			hasher.process_byte(TORRENT_SEPARATOR);
		}
	}

	process_string_lowercase(hasher, entry.filename_ptr(), entry.filename_len());
	return hasher.checksum();
}
|
||||
|
||||
std::string file_storage::file_path(int index, std::string const& save_path) const
|
||||
{
|
||||
TORRENT_ASSERT_PRECOND(index >= 0 && index < int(m_files.size()));
|
||||
|
|
|
@ -357,9 +357,11 @@ namespace libtorrent
|
|||
if (path.empty()) path = "_";
|
||||
}
|
||||
|
||||
// top level is extracting the file for a single-file torrent. The
|
||||
// 'top_level' is extracting the file for a single-file torrent. The
|
||||
// distinction is that the filename is found in "name" rather than
|
||||
// "path"
|
||||
// root_dir is the name of the torrent, unless this is a single file
|
||||
// torrent, in which case it's empty.
|
||||
bool extract_single_file(lazy_entry const& dict, file_storage& files
|
||||
, std::string const& root_dir, ptrdiff_t info_ptr_diff, bool top_level
|
||||
, error_code& ec)
|
||||
|
@ -374,8 +376,6 @@ namespace libtorrent
|
|||
|
||||
boost::int64_t mtime = dict.dict_find_int_value("mtime", 0);
|
||||
|
||||
// prefer the name.utf-8 because if it exists, it is more likely to be
|
||||
// correctly encoded
|
||||
std::string path = root_dir;
|
||||
std::string path_element;
|
||||
char const* filename = NULL;
|
||||
|
@ -383,6 +383,8 @@ namespace libtorrent
|
|||
|
||||
if (top_level)
|
||||
{
|
||||
// prefer the name.utf-8 because if it exists, it is more likely to be
|
||||
// correctly encoded
|
||||
lazy_entry const* p = dict.dict_find_string("name.utf-8");
|
||||
if (p == 0) p = dict.dict_find_string("name");
|
||||
if (p == 0 || p->string_length() == 0)
|
||||
|
@ -407,6 +409,7 @@ namespace libtorrent
|
|||
return false;
|
||||
}
|
||||
|
||||
int preallocate = path.size();
|
||||
for (int i = 0, end(p->list_size()); i < end; ++i)
|
||||
{
|
||||
lazy_entry const* e = p->list_at(i);
|
||||
|
@ -415,6 +418,13 @@ namespace libtorrent
|
|||
ec = errors::torrent_missing_name;
|
||||
return false;
|
||||
}
|
||||
preallocate += e->string_length() + 1;
|
||||
}
|
||||
path.reserve(preallocate);
|
||||
|
||||
for (int i = 0, end(p->list_size()); i < end; ++i)
|
||||
{
|
||||
lazy_entry const* e = p->list_at(i);
|
||||
if (i == end - 1)
|
||||
{
|
||||
filename = e->string_ptr() + info_ptr_diff;
|
||||
|
@ -538,6 +548,8 @@ namespace libtorrent
|
|||
};
|
||||
#endif
|
||||
|
||||
// root_dir is the name of the torrent, unless this is a single file
|
||||
// torrent, in which case it's empty.
|
||||
bool extract_files(lazy_entry const& list, file_storage& target
|
||||
, std::string const& root_dir, ptrdiff_t info_ptr_diff, error_code& ec)
|
||||
{
|
||||
|
@ -752,6 +764,42 @@ namespace libtorrent
|
|||
void torrent_info::resolve_duplicate_filenames()
|
||||
{
|
||||
INVARIANT_CHECK;
|
||||
|
||||
#if TORRENT_HAS_BOOST_UNORDERED
|
||||
boost::unordered_set<boost::uint32_t> files;
|
||||
#else
|
||||
std::set<boost::uint32_t> files;
|
||||
#endif
|
||||
|
||||
std::string empty_str;
|
||||
|
||||
// insert all directories first, to make sure no files
|
||||
// are allowed to collied with them
|
||||
std::vector<std::string> const& paths = m_files.paths();
|
||||
for (int i = 0; i != int(paths.size()); ++i)
|
||||
{
|
||||
files.insert(m_files.path_hash(i, empty_str));
|
||||
}
|
||||
|
||||
for (int i = 0; i < m_files.num_files(); ++i)
|
||||
{
|
||||
// as long as this file already exists
|
||||
// increase the counter
|
||||
boost::uint32_t h = m_files.file_path_hash(i, empty_str);
|
||||
if (!files.insert(h).second)
|
||||
{
|
||||
// This filename appears to already exist!
|
||||
// If this happens, just start over and do it the slow way,
|
||||
// comparing full file names and come up with new names
|
||||
resolve_duplicate_filenames_slow();
|
||||
return;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void torrent_info::resolve_duplicate_filenames_slow()
|
||||
{
|
||||
INVARIANT_CHECK;
|
||||
int cnt = 0;
|
||||
|
||||
#if TORRENT_HAS_BOOST_UNORDERED
|
||||
|
@ -759,7 +807,9 @@ namespace libtorrent
|
|||
#else
|
||||
std::set<std::string, string_less_no_case> files;
|
||||
#endif
|
||||
|
||||
std::vector<std::string> const& paths = m_files.paths();
|
||||
files.reserve(paths.size() + m_files.num_files());
|
||||
|
||||
// insert all directories first, to make sure no files
|
||||
// are allowed to collide with them
|
||||
|
|
|
@ -117,6 +117,8 @@ namespace libtorrent { namespace
|
|||
|
||||
virtual void on_files_checked()
|
||||
{
|
||||
// TODO: 2 if we were to initialize m_metadata_size lazily instead,
|
||||
// we would probably be more efficient
|
||||
// initialize m_metadata_size
|
||||
metadata();
|
||||
}
|
||||
|
|
|
@ -166,6 +166,7 @@ int test_main()
|
|||
}
|
||||
|
||||
{
|
||||
// test map_file
|
||||
file_storage fs;
|
||||
fs.set_piece_length(512);
|
||||
fs.add_file(combine_path("temp_storage", "test1.tmp"), 17);
|
||||
|
@ -191,6 +192,29 @@ int test_main()
|
|||
TEST_EQUAL(rq.length, 841);
|
||||
}
|
||||
|
||||
{
|
||||
// test file_path_hash and path_hash. Make sure we can detect a path
|
||||
// whose name collides with the name of a file
|
||||
file_storage fs;
|
||||
fs.set_piece_length(512);
|
||||
fs.add_file(combine_path("temp_storage", combine_path("foo", "test1")), 17);
|
||||
fs.add_file(combine_path("temp_storage", "foo"), 612);
|
||||
|
||||
fprintf(stderr, "path: %s\n", fs.paths()[0].c_str());
|
||||
fprintf(stderr, "file: %s\n", fs.file_path(1).c_str());
|
||||
boost::uint32_t file_hash = fs.file_path_hash(1, "a");
|
||||
boost::uint32_t path_hash = fs.path_hash(0, "a");
|
||||
TEST_EQUAL(file_hash, path_hash);
|
||||
}
|
||||
|
||||
// TODO: test map_block
|
||||
// TODO: test piece_size(int piece)
|
||||
// TODO: test file_index_at_offset
|
||||
// TODO: test file attributes
|
||||
// TODO: test symlinks
|
||||
// TODO: test pad_files
|
||||
// TODO: test reorder_file (make sure internal_file_entry::swap() is used)
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
|
|
@ -84,7 +84,7 @@ struct mock_peer_connection : peer_connection_interface
|
|||
virtual tcp::endpoint const& remote() const { return m_remote; }
|
||||
virtual tcp::endpoint local_endpoint() const { return ep("127.0.0.1", 8080); }
|
||||
virtual void disconnect(error_code const& ec
|
||||
, peer_connection_interface::operation_t op, int error = 0)
|
||||
, operation_t op, int error = 0)
|
||||
{ /* remove from mock_torrent list */ m_tp = 0; }
|
||||
virtual peer_id const& pid() const { return m_id; }
|
||||
virtual void set_holepunch_mode() {}
|
||||
|
|
|
@ -533,7 +533,7 @@ int test_torrent_parse()
|
|||
return 0;
|
||||
}
|
||||
|
||||
void test_storage()
|
||||
void test_resolve_duplicates()
|
||||
{
|
||||
file_storage fs;
|
||||
|
||||
|
@ -547,6 +547,8 @@ void test_storage()
|
|||
fs.add_file("test/B.exe", 0x4000);
|
||||
fs.add_file("test/test/TEMPORARY.TXT", 0x4000);
|
||||
fs.add_file("test/A", 0x4000);
|
||||
fs.add_file("test/long/path/name/that/collides", 0x4000);
|
||||
fs.add_file("test/long/path", 0x4000);
|
||||
|
||||
libtorrent::create_torrent t(fs, 0x4000);
|
||||
|
||||
|
@ -564,7 +566,7 @@ void test_storage()
|
|||
|
||||
torrent_info ti(&tmp[0], tmp.size());
|
||||
|
||||
char const* filenames[10] =
|
||||
char const* filenames[] =
|
||||
{
|
||||
"test/temporary.txt",
|
||||
"test/A/tmp",
|
||||
|
@ -576,6 +578,8 @@ void test_storage()
|
|||
"test/B.2.exe", // duplicate of b.exe
|
||||
"test/test/TEMPORARY.TXT", // a file with the same name in a seprate directory is fine
|
||||
"test/A.2", // duplicate of directory a
|
||||
"test/long/path/name/that/collides", // a subset of this path collides with the next filename
|
||||
"test/long/path.1" // so this file needs to be renamed, to not collide with the path name
|
||||
};
|
||||
|
||||
for (int i = 0; i < ti.num_files(); ++i)
|
||||
|
@ -645,9 +649,10 @@ void test_copy()
|
|||
|
||||
int test_main()
|
||||
{
|
||||
test_storage();
|
||||
test_resolve_duplicates();
|
||||
test_copy();
|
||||
test_torrent_parse();
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
|
Loading…
Reference in New Issue