Optimized loading and parsing of torrents, specifically the detection of duplicate filenames (optimizing for the common case of no collisions). Expanded the unit test to uncover an unsupported case (not fixed yet). Cut down on memory allocation and copying when parsing torrent files in memory.

This commit is contained in:
Arvid Norberg 2015-02-17 02:08:47 +00:00
parent 3d47a1fb81
commit d49e97afb3
8 changed files with 261 additions and 39 deletions

View File

@ -136,6 +136,8 @@ namespace libtorrent
void set_name(char const* n, bool borrow_string = false, int string_len = 0);
std::string filename() const;
char const* filename_ptr() const { return name; }
// returns the length of the filename, in bytes. When name_len holds the
// name_is_owned sentinel value the length is not stored, so it is
// measured with strlen() instead (presumably an owned name is always
// null terminated -- confirm against set_name()).
int filename_len() const
{
	if (name_len == name_is_owned) return strlen(name);
	return name_len;
}
enum {
name_is_owned = (1<<12)-1,
@ -457,6 +459,13 @@ namespace libtorrent
bool pad_file_at(int index) const;
boost::int64_t file_offset(int index) const;
// returns the crc32 hash of file_path(index)
boost::uint32_t file_path_hash(int index, std::string const& save_path) const;
// returns the crc32 hash of the path at index. Note, this index does not
// refer to a file, but to a path in the vector returned by paths().
boost::uint32_t path_hash(int index, std::string const& save_path) const;
// flags indicating various attributes for files in
// a file_storage.
enum file_flags_t
@ -493,6 +502,7 @@ namespace libtorrent
// filesystem by making them all point to the same filename, but with
// different file bases, so that they don't overlap.
// torrent_info::remap_files() can be used to use a new file layout.
// TODO: 3 deprecate the file_base feature
boost::int64_t file_base(int index) const;
void set_file_base(int index, boost::int64_t off);
@ -544,7 +554,8 @@ namespace libtorrent
// the number of pieces in the torrent
int m_num_pieces;
void update_path_index(internal_file_entry& e);
void update_path_index(internal_file_entry& e, std::string const& path
, bool set_name = true);
void reorder_file(int index, int dst);
// the list of files that this torrent consists of

View File

@ -677,6 +677,9 @@ namespace libtorrent
void resolve_duplicate_filenames();
// the slow path, in case we detect/suspect a name collision
void resolve_duplicate_filenames_slow();
#if TORRENT_USE_INVARIANT_CHECKS
friend class invariant_access;
void check_invariant() const;

View File

@ -35,9 +35,16 @@ POSSIBILITY OF SUCH DAMAGE.
#include "libtorrent/file.hpp"
#include "libtorrent/utf8.hpp"
#include <boost/bind.hpp>
#include <boost/crc.hpp>
#include <cstdio>
#include <algorithm>
#if defined(TORRENT_WINDOWS) || defined(TORRENT_OS2)
#define TORRENT_SEPARATOR '\\'
#else
#define TORRENT_SEPARATOR '/'
#endif
namespace libtorrent
{
file_storage::file_storage()
@ -70,43 +77,56 @@ namespace libtorrent
return piece_length();
}
void file_storage::update_path_index(internal_file_entry& e)
namespace
{
std::string fn = e.filename();
if (is_complete(fn))
// returns true if the first ``len`` bytes at ``str`` are exactly the
// contents of ``str2``. ``str`` does not need to be null terminated,
// only ``len`` bytes of it are ever read, and only when the lengths
// match.
bool compare_string(char const* str, int len, std::string const& str2)
{
	bool const same_length = str2.size() == std::size_t(len);
	return same_length && std::memcmp(str2.c_str(), str, len) == 0;
}
}
// path is not supposed to include the name of the torrent itself.
void file_storage::update_path_index(internal_file_entry& e
, std::string const& path, bool set_name)
{
if (is_complete(path))
{
TORRENT_ASSERT(set_name);
e.set_name(path.c_str());
e.path_index = -2;
return;
}
TORRENT_ASSERT(path[0] != '/');
// sorry about this messy string handling, but I did
// profile it, and it was expensive
char const* leaf = filename_cstr(fn.c_str());
char const* leaf = filename_cstr(path.c_str());
char const* branch_path = "";
if (leaf > fn.c_str())
int branch_len = 0;
if (leaf > path.c_str())
{
// split the string into the leaf filename
// and the branch path
branch_path = fn.c_str();
((char*)leaf)[-1] = 0;
branch_path = path.c_str();
branch_len = leaf - path.c_str();
}
if (branch_path[0] == 0)
if (branch_len <= 0)
{
if (set_name) e.set_name(leaf);
e.path_index = -1;
return;
}
int branch_len = strlen(branch_path);
if (branch_len >= m_name.size()
&& std::memcmp(branch_path, m_name.c_str(), m_name.size()) == 0
&& (branch_len == m_name.size()
#ifdef TORRENT_WINDOWS
|| branch_path[m_name.size()] == '\\'
#endif
|| branch_path[m_name.size()] == '/'
))
&& std::memcmp(branch_path, m_name.c_str(), m_name.size()) == 0)
{
branch_path += m_name.size()
// the +1 is to skip the trailing '/' (or '\')
int offset = m_name.size()
+ (m_name.size() == branch_len?0:1);
branch_path += offset;
branch_len -= offset;
e.no_root_dir = false;
}
else
@ -116,20 +136,29 @@ namespace libtorrent
// do we already have this path in the path list?
std::vector<std::string>::reverse_iterator p
= std::find(m_paths.rbegin(), m_paths.rend(), branch_path);
= std::find_if(m_paths.rbegin(), m_paths.rend()
, boost::bind(&compare_string, branch_path, branch_len, _1));
if (p == m_paths.rend())
{
// no, we don't. add it
e.path_index = m_paths.size();
m_paths.push_back(branch_path);
TORRENT_ASSERT(branch_path[0] != '/');
// trim trailing slashes
if (branch_len > 0 && branch_path[branch_len-1] == TORRENT_SEPARATOR)
--branch_len;
// poor man's emplace back
m_paths.resize(m_paths.size() + 1);
m_paths.back().assign(branch_path, branch_len);
}
else
{
// yes we do. use it
e.path_index = p.base() - m_paths.begin() - 1;
}
e.set_name(leaf);
if (set_name) e.set_name(leaf);
}
#ifndef TORRENT_NO_DEPRECATE
@ -260,8 +289,7 @@ namespace libtorrent
TORRENT_ASSERT_PRECOND(index >= 0 && index < int(m_files.size()));
std::string utf8;
wchar_utf8(new_filename, utf8);
m_files[index].set_name(utf8.c_str());
update_path_index(m_files[index]);
update_path_index(m_files[index], utf8);
}
void file_storage::add_file(std::wstring const& file, boost::int64_t file_size
@ -282,8 +310,7 @@ namespace libtorrent
void file_storage::rename_file(int index, std::string const& new_filename)
{
TORRENT_ASSERT_PRECOND(index >= 0 && index < int(m_files.size()));
m_files[index].set_name(new_filename.c_str());
update_path_index(m_files[index]);
update_path_index(m_files[index], new_filename);
}
namespace
@ -468,15 +495,17 @@ namespace libtorrent
if (m_files.empty())
m_name = split_path(path).c_str();
}
internal_file_entry ife;
m_files.push_back(ife);
// this is poor-man's emplace_back()
m_files.resize(m_files.size() + 1);
internal_file_entry& e = m_files.back();
// first set the filename to the full path so update_path_index()
// can do its thing, then rename it to point to the borrowed filename
// pointer
e.set_name(path.c_str());
update_path_index(e);
// the last argument specifies whether the function should also set
// the filename. If it does, it will copy the leaf filename from path.
// If filename is NULL, we should copy it. If it isn't, we're borrowing
// it and we can save the copy by setting it after this call to
// update_path_index().
update_path_index(e, path, filename == NULL);
// filename is allowed to be NULL, in which case we just use path
if (filename)
@ -547,6 +576,104 @@ namespace libtorrent
return m_file_base[index];
}
namespace
{
// feeds ``len`` bytes, starting at ``str``, into ``crc`` one at a time,
// passing each byte through to_lower() first. A ``len`` of zero (or
// less) feeds nothing.
template <class CRC>
void process_string_lowercase(CRC& crc, char const* str, int len)
{
	for (int remaining = len; remaining > 0; --remaining, ++str)
	{
		crc.process_byte(to_lower(*str));
	}
}
}
// returns the crc32 (polynomial 0x1EDC6F41) of the lowercased string
// made up of, in order: save_path and a separator (only if save_path is
// not empty), the torrent name, a separator and the path stored at
// ``index``. Note that ``index`` refers to an entry in m_paths, not to a
// file.
boost::uint32_t file_storage::path_hash(int index
	, std::string const& save_path) const
{
	TORRENT_ASSERT_PRECOND(index >= 0 && index < int(m_paths.size()));

	typedef boost::crc_optimal<32, 0x1EDC6F41, 0xFFFFFFFF, 0xFFFFFFFF, true, true> crc_type;
	crc_type hasher;

	if (!save_path.empty())
	{
		process_string_lowercase(hasher, save_path.c_str(), save_path.size());
		// a separator is added right here, so save_path is not expected
		// to already end with one
		TORRENT_ASSERT(save_path[save_path.size()-1] != TORRENT_SEPARATOR);
		hasher.process_byte(TORRENT_SEPARATOR);
	}

	process_string_lowercase(hasher, m_name.c_str(), m_name.size());
	hasher.process_byte(TORRENT_SEPARATOR);

	std::string const& dir = m_paths[index];
	process_string_lowercase(hasher, dir.c_str(), dir.size());

	return hasher.checksum();
}
// returns the crc32 (polynomial 0x1EDC6F41) of the lowercased path of
// the file at ``index``. The bytes that are hashed depend on the entry:
//
// * path_index == -2: the filename is an absolute path and is hashed on
//   its own
// * path_index == -1: save_path, separator, filename
// * no_root_dir set: save_path, separator, directory, separator, filename
// * otherwise: save_path, separator, torrent name, separator, directory,
//   separator, filename
//
// save_path and the directory are skipped, together with their
// separator, when they are empty.
boost::uint32_t file_storage::file_path_hash(int index
	, std::string const& save_path) const
{
	TORRENT_ASSERT_PRECOND(index >= 0 && index < int(m_files.size()));
	internal_file_entry const& entry = m_files[index];

	typedef boost::crc_optimal<32, 0x1EDC6F41, 0xFFFFFFFF, 0xFFFFFFFF, true, true> crc_type;
	crc_type hasher;

	// -2 means this is an absolute path filename. Nothing is put in
	// front of it
	if (entry.path_index == -2)
	{
		process_string_lowercase(hasher, entry.filename_ptr(), entry.filename_len());
		return hasher.checksum();
	}

	// every other kind of entry starts out with the save path
	if (!save_path.empty())
	{
		process_string_lowercase(hasher, save_path.c_str(), save_path.size());
		TORRENT_ASSERT(save_path[save_path.size()-1] != TORRENT_SEPARATOR);
		hasher.process_byte(TORRENT_SEPARATOR);
	}

	// -1 means no path, in which case the filename follows immediately
	if (entry.path_index != -1)
	{
		// unless the entry is flagged as having no root directory, the
		// name of the torrent comes before the directory
		if (!entry.no_root_dir)
		{
			process_string_lowercase(hasher, m_name.c_str(), m_name.size());
			TORRENT_ASSERT(m_name.size() > 0);
			TORRENT_ASSERT(m_name[m_name.size()-1] != TORRENT_SEPARATOR);
			hasher.process_byte(TORRENT_SEPARATOR);
		}

		std::string const& dir = m_paths[entry.path_index];
		if (!dir.empty())
		{
			process_string_lowercase(hasher, dir.c_str(), dir.size());
			TORRENT_ASSERT(dir[dir.size()-1] != TORRENT_SEPARATOR);
			hasher.process_byte(TORRENT_SEPARATOR);
		}
	}

	process_string_lowercase(hasher, entry.filename_ptr(), entry.filename_len());
	return hasher.checksum();
}
std::string file_storage::file_path(int index, std::string const& save_path) const
{
TORRENT_ASSERT_PRECOND(index >= 0 && index < int(m_files.size()));

View File

@ -357,9 +357,11 @@ namespace libtorrent
if (path.empty()) path = "_";
}
// top level is extracting the file for a single-file torrent. The
// 'top_level' is extracting the file for a single-file torrent. The
// distinction is that the filename is found in "name" rather than
// "path"
// root_dir is the name of the torrent, unless this is a single file
// torrent, in which case it's empty.
bool extract_single_file(lazy_entry const& dict, file_storage& files
, std::string const& root_dir, ptrdiff_t info_ptr_diff, bool top_level
, error_code& ec)
@ -374,8 +376,6 @@ namespace libtorrent
boost::int64_t mtime = dict.dict_find_int_value("mtime", 0);
// prefer the name.utf-8 because if it exists, it is more likely to be
// correctly encoded
std::string path = root_dir;
std::string path_element;
char const* filename = NULL;
@ -383,6 +383,8 @@ namespace libtorrent
if (top_level)
{
// prefer the name.utf-8 because if it exists, it is more likely to be
// correctly encoded
lazy_entry const* p = dict.dict_find_string("name.utf-8");
if (p == 0) p = dict.dict_find_string("name");
if (p == 0 || p->string_length() == 0)
@ -407,6 +409,7 @@ namespace libtorrent
return false;
}
int preallocate = path.size();
for (int i = 0, end(p->list_size()); i < end; ++i)
{
lazy_entry const* e = p->list_at(i);
@ -415,6 +418,13 @@ namespace libtorrent
ec = errors::torrent_missing_name;
return false;
}
preallocate += e->string_length() + 1;
}
path.reserve(preallocate);
for (int i = 0, end(p->list_size()); i < end; ++i)
{
lazy_entry const* e = p->list_at(i);
if (i == end - 1)
{
filename = e->string_ptr() + info_ptr_diff;
@ -538,6 +548,8 @@ namespace libtorrent
};
#endif
// root_dir is the name of the torrent, unless this is a single file
// torrent, in which case it's empty.
bool extract_files(lazy_entry const& list, file_storage& target
, std::string const& root_dir, ptrdiff_t info_ptr_diff, error_code& ec)
{
@ -752,6 +764,42 @@ namespace libtorrent
void torrent_info::resolve_duplicate_filenames()
{
INVARIANT_CHECK;
#if TORRENT_HAS_BOOST_UNORDERED
boost::unordered_set<boost::uint32_t> files;
#else
std::set<boost::uint32_t> files;
#endif
std::string empty_str;
// insert all directories first, to make sure no files
// are allowed to collied with them
std::vector<std::string> const& paths = m_files.paths();
for (int i = 0; i != int(paths.size()); ++i)
{
files.insert(m_files.path_hash(i, empty_str));
}
for (int i = 0; i < m_files.num_files(); ++i)
{
// as long as this file already exists
// increase the counter
boost::uint32_t h = m_files.file_path_hash(i, empty_str);
if (!files.insert(h).second)
{
// This filename appears to already exist!
// If this happens, just start over and do it the slow way,
// comparing full file names and come up with new names
resolve_duplicate_filenames_slow();
return;
}
}
}
void torrent_info::resolve_duplicate_filenames_slow()
{
INVARIANT_CHECK;
int cnt = 0;
#if TORRENT_HAS_BOOST_UNORDERED
@ -759,7 +807,9 @@ namespace libtorrent
#else
std::set<std::string, string_less_no_case> files;
#endif
std::vector<std::string> const& paths = m_files.paths();
files.reserve(paths.size() + m_files.num_files());
// insert all directories first, to make sure no files
// are allowed to collide with them

View File

@ -117,6 +117,8 @@ namespace libtorrent { namespace
// presumably invoked once the torrent's files have been checked (going
// by the name -- confirm against the plugin interface). The return value
// of metadata() is discarded; it is called only for its side effect.
virtual void on_files_checked()
{
// TODO: 2 if we were to initialize m_metadata_size lazily instead,
// we would probably be more efficient
// initialize m_metadata_size
metadata();
}

View File

@ -166,6 +166,7 @@ int test_main()
}
{
// test map_file
file_storage fs;
fs.set_piece_length(512);
fs.add_file(combine_path("temp_storage", "test1.tmp"), 17);
@ -191,6 +192,29 @@ int test_main()
TEST_EQUAL(rq.length, 841);
}
{
// test file_path_hash and path_hash. Make sure we can detect a directory
// path whose name collides with the name of a file
file_storage fs;
fs.set_piece_length(512);
fs.add_file(combine_path("temp_storage", combine_path("foo", "test1")), 17);
fs.add_file(combine_path("temp_storage", "foo"), 612);
fprintf(stderr, "path: %s\n", fs.paths()[0].c_str());
fprintf(stderr, "file: %s\n", fs.file_path(1).c_str());
boost::uint32_t file_hash = fs.file_path_hash(1, "a");
boost::uint32_t path_hash = fs.path_hash(0, "a");
TEST_EQUAL(file_hash, path_hash);
}
// TODO: test map_block
// TODO: test piece_size(int piece)
// TODO: test file_index_at_offset
// TODO: test file attributes
// TODO: test symlinks
// TODO: test pad_files
// TODO: test reorder_file (make sure internal_file_entry::swap() is used)
return 0;
}

View File

@ -84,7 +84,7 @@ struct mock_peer_connection : peer_connection_interface
virtual tcp::endpoint const& remote() const { return m_remote; }
virtual tcp::endpoint local_endpoint() const { return ep("127.0.0.1", 8080); }
virtual void disconnect(error_code const& ec
, peer_connection_interface::operation_t op, int error = 0)
, operation_t op, int error = 0)
{ /* remove from mock_torrent list */ m_tp = 0; }
virtual peer_id const& pid() const { return m_id; }
virtual void set_holepunch_mode() {}

View File

@ -533,7 +533,7 @@ int test_torrent_parse()
return 0;
}
void test_storage()
void test_resolve_duplicates()
{
file_storage fs;
@ -547,6 +547,8 @@ void test_storage()
fs.add_file("test/B.exe", 0x4000);
fs.add_file("test/test/TEMPORARY.TXT", 0x4000);
fs.add_file("test/A", 0x4000);
fs.add_file("test/long/path/name/that/collides", 0x4000);
fs.add_file("test/long/path", 0x4000);
libtorrent::create_torrent t(fs, 0x4000);
@ -564,7 +566,7 @@ void test_storage()
torrent_info ti(&tmp[0], tmp.size());
char const* filenames[10] =
char const* filenames[] =
{
"test/temporary.txt",
"test/A/tmp",
@ -576,6 +578,8 @@ void test_storage()
"test/B.2.exe", // duplicate of b.exe
"test/test/TEMPORARY.TXT", // a file with the same name in a separate directory is fine
"test/A.2", // duplicate of directory a
"test/long/path/name/that/collides", // a subset of this path collides with the next filename
"test/long/path.1" // so this file needs to be renamed, to not collide with the path name
};
for (int i = 0; i < ti.num_files(); ++i)
@ -645,9 +649,10 @@ void test_copy()
int test_main()
{
test_storage();
test_resolve_duplicates();
test_copy();
test_torrent_parse();
return 0;
}