From c84e96898b8a5b32175c1150c1b509222c0b5272 Mon Sep 17 00:00:00 2001 From: Arvid Norberg Date: Thu, 12 Oct 2006 23:51:10 +0000 Subject: [PATCH] added workaround for incorrectly encoded paths in torrent files --- ChangeLog | 2 + Makefile.am | 2 + docs/manual.html | 9 ++- docs/manual.rst | 10 ++- docs/ubuntu_build_notes.html | 12 ++-- docs/ubuntu_build_notes.rst | 12 ++-- include/libtorrent/torrent_info.hpp | 6 ++ src/torrent_info.cpp | 107 ++++++++++++++++++++++++++-- 8 files changed, 140 insertions(+), 20 deletions(-) diff --git a/ChangeLog b/ChangeLog index 21fedb5aa..8cfc868bf 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,5 @@ + * added support for incorrectly encoded paths in torrent files + (assumes Latin-1 encoding and converts to UTF-8). * fixed bug with file_progress() with files = 0 bytes * fixed a race condition bug in udp_tracker_connection that could cause a crash. diff --git a/Makefile.am b/Makefile.am index f82c216e9..1e5c40944 100644 --- a/Makefile.am +++ b/Makefile.am @@ -13,6 +13,8 @@ docs/qbittorrent_thumb.jpg \ docs/ziptorrent_thumb.gif \ docs/vs2005_build_notes.html \ docs/vs2005_build_notes.rst \ +docs/ubuntu_build_notes.html \ +docs/ubuntu_build_notes.rst \ docs/udp_tracker_protocol.html docs/client_test.rst docs/client_test.html \ docs/unicode_support.png docs/client_test.png docs/style.css Jamfile project-root.jam \ m4/ac_cxx_namespaces.m4 m4/acx_pthread.m4 m4/ax_boost_date-time.m4 \ diff --git a/docs/manual.html b/docs/manual.html index 6fb6d72c2..5b2198004 100755 --- a/docs/manual.html +++ b/docs/manual.html @@ -944,13 +944,20 @@ torrent, all the files starts with a directory with the same name as size is the size of the file (in bytes) and offset is the byte offset of the file within the torrent. i.e. the sum of all the sizes of the files -before this one in the file list this one in the file list..

+before this one in the file list.

+

orig_path is set to 0 in case the path element is an exact copy of that +found in the metadata. In case the path in the original metadata was +incorrectly encoded, and had to be fixed in order to be acceptable utf-8, +the original string is preserved in orig_path. The reason to keep it +is to be able to reproduce the info-section exactly, with the correct +info-hash.

 struct file_entry
 {
         boost::filesystem::path path;
         size_type offset;
         size_type size;
+        boost::shared_ptr<boost::filesystem::path> orig_path; // 0 unless the path had to be re-encoded
 };
 
diff --git a/docs/manual.rst b/docs/manual.rst index f2ea7bf2f..839100f4c 100755 --- a/docs/manual.rst +++ b/docs/manual.rst @@ -854,7 +854,14 @@ The filenames are encoded with UTF-8. ``size`` is the size of the file (in bytes) and ``offset`` is the byte offset of the file within the torrent. i.e. the sum of all the sizes of the files -before this one in the file list this one in the file list.. +before this one in the file list. + +``orig_path`` is set to 0 in case the path element is an exact copy of that +found in the metadata. In case the path in the original metadata was +incorrectly encoded, and had to be fixed in order to be acceptable utf-8, +the original string is preserved in ``orig_path``. The reason to keep it +is to be able to reproduce the info-section exactly, with the correct +info-hash. :: @@ -863,6 +870,7 @@ before this one in the file list this one in the file list.. boost::filesystem::path path; size_type offset; size_type size; + boost::shared_ptr orig_path; }; diff --git a/docs/ubuntu_build_notes.html b/docs/ubuntu_build_notes.html index 617904c16..22241e61d 100644 --- a/docs/ubuntu_build_notes.html +++ b/docs/ubuntu_build_notes.html @@ -47,13 +47,13 @@ cvs -d:pserver:anonymous@boost.cvs.sourceforge.net:/cvsroot/boost login cvs -z3 -d:pserver:anonymous@boost.cvs.sourceforge.net:/cvsroot/boost checkout boost cvs -d:pserver:anonymous@boost.cvs.sourceforge.net:/cvsroot/boost logout -cvs -d:pserver:anonym...@libtorrent.cvs.sourceforge.net:/cvsroot/libtorrent login -cvs -z3 -d:pserver:anonym...@libtorrent.cvs.sourceforge.net:/cvsroot/libtorrent co -P libtorrent -cvs -d:pserver:anonym...@libtorrent.cvs.sourceforge.net:/cvsroot/libtorrent logout +cvs -d:pserver:anonymous@libtorrent.cvs.sourceforge.net:/cvsroot/libtorrent login +cvs -z3 -d:pserver:anonymous@libtorrent.cvs.sourceforge.net:/cvsroot/libtorrent co -P libtorrent +cvs -d:pserver:anonymous@libtorrent.cvs.sourceforge.net:/cvsroot/libtorrent logout -cvs 
-d:pserver:anonym...@asio.cvs.sourceforge.net:/cvsroot/asio login -cvs -z3 -d:pserver:anonym...@asio.cvs.sourceforge.net:/cvsroot/asio co -P asio -cvs -d:pserver:anonym...@asio.cvs.sourceforge.net:/cvsroot/asio login +cvs -d:pserver:anonymous@asio.cvs.sourceforge.net:/cvsroot/asio login +cvs -z3 -d:pserver:anonymous@asio.cvs.sourceforge.net:/cvsroot/asio co -P asio +cvs -d:pserver:anonymous@asio.cvs.sourceforge.net:/cvsroot/asio logout
diff --git a/docs/ubuntu_build_notes.rst b/docs/ubuntu_build_notes.rst index 7ad7c877b..59bb11fe0 100644 --- a/docs/ubuntu_build_notes.rst +++ b/docs/ubuntu_build_notes.rst @@ -30,13 +30,13 @@ by executing the following commands:: cvs -z3 -d:pserver:anonymous@boost.cvs.sourceforge.net:/cvsroot/boost checkout boost cvs -d:pserver:anonymous@boost.cvs.sourceforge.net:/cvsroot/boost logout - cvs -d:pserver:anonym...@libtorrent.cvs.sourceforge.net:/cvsroot/libtorrent login - cvs -z3 -d:pserver:anonym...@libtorrent.cvs.sourceforge.net:/cvsroot/libtorrent co -P libtorrent - cvs -d:pserver:anonym...@libtorrent.cvs.sourceforge.net:/cvsroot/libtorrent logout + cvs -d:pserver:anonymous@libtorrent.cvs.sourceforge.net:/cvsroot/libtorrent login + cvs -z3 -d:pserver:anonymous@libtorrent.cvs.sourceforge.net:/cvsroot/libtorrent co -P libtorrent + cvs -d:pserver:anonymous@libtorrent.cvs.sourceforge.net:/cvsroot/libtorrent logout - cvs -d:pserver:anonym...@asio.cvs.sourceforge.net:/cvsroot/asio login - cvs -z3 -d:pserver:anonym...@asio.cvs.sourceforge.net:/cvsroot/asio co -P asio - cvs -d:pserver:anonym...@asio.cvs.sourceforge.net:/cvsroot/asio login + cvs -d:pserver:anonymous@asio.cvs.sourceforge.net:/cvsroot/asio login + cvs -z3 -d:pserver:anonymous@asio.cvs.sourceforge.net:/cvsroot/asio co -P asio + cvs -d:pserver:anonymous@asio.cvs.sourceforge.net:/cvsroot/asio logout Step 2: Building boost ====================== diff --git a/include/libtorrent/torrent_info.hpp b/include/libtorrent/torrent_info.hpp index 2a7fce751..43de675f9 100755 --- a/include/libtorrent/torrent_info.hpp +++ b/include/libtorrent/torrent_info.hpp @@ -45,6 +45,7 @@ POSSIBILITY OF SUCH DAMAGE. 
#include #include #include +#include #ifdef _MSC_VER #pragma warning(pop) @@ -65,6 +66,11 @@ namespace libtorrent boost::filesystem::path path; size_type offset; // the offset of this file inside the torrent size_type size; // the size of this file + // if the path was incorrectly encoded, this is + // the original corrupt encoded string. It is + // preserved in order to be able to reproduce + // the correct info-hash + boost::shared_ptr orig_path; }; struct TORRENT_EXPORT file_slice diff --git a/src/torrent_info.cpp b/src/torrent_info.cpp index 243d1d866..6b008369b 100755 --- a/src/torrent_info.cpp +++ b/src/torrent_info.cpp @@ -62,6 +62,98 @@ using namespace boost::filesystem; namespace { + void convert_to_utf8(std::string& str, unsigned char chr) + { + str += 0xc0 | ((chr & 0xff) >> 6); + str += 0x80 | (chr & 0x3f); + } + + void verify_encoding(file_entry& target) + { + std::string tmp_path; + std::string file_path = target.path.string(); + bool valid_encoding = true; + for (std::string::iterator i = file_path.begin() + , end(file_path.end()); i != end; ++i) + { + // valid ascii-character + if ((*i & 0x80) == 0) + { + tmp_path += *i; + continue; + } + + if (std::distance(i, end) < 2) + { + convert_to_utf8(tmp_path, *i); + valid_encoding = false; + continue; + } + + // valid 2-byte utf-8 character + if ((i[0] & 0xe0) == 0xc0 + && (i[1] & 0xc0) == 0x80) + { + tmp_path += i[0]; + tmp_path += i[1]; + i += 1; + continue; + } + + if (std::distance(i, end) < 3) + { + convert_to_utf8(tmp_path, *i); + valid_encoding = false; + continue; + } + + // valid 3-byte utf-8 character + if ((i[0] & 0xf0) == 0xe0 + && (i[1] & 0xc0) == 0x80 + && (i[2] & 0xc0) == 0x80) + { + tmp_path += i[0]; + tmp_path += i[1]; + tmp_path += i[2]; + i += 2; + continue; + } + + if (std::distance(i, end) < 4) + { + convert_to_utf8(tmp_path, *i); + valid_encoding = false; + continue; + } + + // valid 4-byte utf-8 character (lead byte 11110xxx) + if ((i[0] & 0xf8) == 0xf0 + && (i[1] & 0xc0) == 0x80 + && (i[2] & 0xc0) == 
0x80 + && (i[3] & 0xc0) == 0x80) + { + tmp_path += i[0]; + tmp_path += i[1]; + tmp_path += i[2]; + tmp_path += i[3]; + i += 3; + continue; + } + + convert_to_utf8(tmp_path, *i); + valid_encoding = false; + } + // the encoding was not valid utf-8 + // save the original encoding and replace the + // commonly used path with the correctly + // encoded string + if (!valid_encoding) + { + target.orig_path.reset(new path(target.path)); + target.path = tmp_path; + } + } + void extract_single_file(const entry& dict, file_entry& target , std::string const& root_dir) { @@ -89,6 +181,7 @@ namespace if (i->string() != "..") target.path /= i->string(); } + verify_encoding(target); if (target.path.is_complete()) throw std::runtime_error("torrent contains " "a file with an absolute path: '" + target.path.native_file_string() + "'"); @@ -501,7 +594,7 @@ namespace libtorrent files = entry(entry::list_t); for (std::vector::const_iterator i = m_files.begin(); - i != m_files.end(); ++i) + i != m_files.end(); ++i) { files.list().push_back(entry(entry::dictionary_t)); entry& file_e = files.list().back(); @@ -509,12 +602,14 @@ namespace libtorrent entry& path_e = file_e["path"]; path_e = entry(entry::list_t); - fs::path const& file_path(i->path); - assert(file_path.has_branch_path()); - assert(*file_path.begin() == m_name); + fs::path const* file_path; + if (i->orig_path) file_path = &(*i->orig_path); + else file_path = &i->path; + assert(file_path->has_branch_path()); + assert(*file_path->begin() == m_name); - for (fs::path::iterator j = boost::next(file_path.begin()); - j != file_path.end(); ++j) + for (fs::path::iterator j = boost::next(file_path->begin()); + j != file_path->end(); ++j) { path_e.list().push_back(entry(*j)); }