added workaround for incorrectly encoded paths in torrent files

2006-10-12 23:51:10 +00:00 · 2006-10-12 23:51:10 +00:00 · c84e96898b
parent 7eb6090a08
commit c84e96898b
8 changed files with 140 additions and 20 deletions
--- a/2
+++ b/2
@ -1,3 +1,5 @@
+	* added support for incorrectly encoded paths in torrent files
+	  (assumes Latin-1 encoding and converts to UTF-8).
 	* fixed bug with file_progress() with files = 0 bytes
 	* fixed a race condition bug in udp_tracker_connection that could
 	  cause a crash.
--- a/Makefile.am
+++ b/Makefile.am
@ -13,6 +13,8 @@ docs/qbittorrent_thumb.jpg \
 docs/ziptorrent_thumb.gif \
 docs/vs2005_build_notes.html \
 docs/vs2005_build_notes.rst \
+docs/ubuntu_build_notes.html \
+docs/ubuntu_build_notes.rst \
 docs/udp_tracker_protocol.html docs/client_test.rst docs/client_test.html \
 docs/unicode_support.png docs/client_test.png docs/style.css Jamfile project-root.jam \
 m4/ac_cxx_namespaces.m4 m4/acx_pthread.m4 m4/ax_boost_date-time.m4 \
--- a/docs/manual.html
+++ b/docs/manual.html
@ -944,13 +944,20 @@ torrent, all the files starts with a directory with the same name as <tt class="
 The filenames are encoded with UTF-8.</p>
 <p><tt class="docutils literal"><span class="pre">size</span></tt> is the size of the file (in bytes) and <tt class="docutils literal"><span class="pre">offset</span></tt> is the byte offset
 of the file within the torrent. i.e. the sum of all the sizes of the files
-before this one in the file list this one in the file list..</p>
+before this one in the file list.</p>
+<p><tt class="docutils literal"><span class="pre">orig_path</span></tt> is set to 0 in case the path element is an exact copy of that
+found in the metadata. In case the path in the original metadata was
+incorrectly encoded, and had to be fixed in order to be acceptable utf-8,
+the original string is preserved in <tt class="docutils literal"><span class="pre">orig_path</span></tt>. The reason to keep it
+is to be able to reproduce the info-section exactly, with the correct
+info-hash.</p>
 <pre class="literal-block">
 struct file_entry
 {
        boost::filesystem::path path;
        size_type offset;
        size_type size;
+        boost::shared_ptr&lt;boost::filesystem::path&gt; orig_path;
 };
 </pre>
 </div>
--- a/docs/manual.rst
+++ b/docs/manual.rst
@ -854,7 +854,14 @@ The filenames are encoded with UTF-8.

 ``size`` is the size of the file (in bytes) and ``offset`` is the byte offset
 of the file within the torrent. i.e. the sum of all the sizes of the files
-before this one in the file list this one in the file list..
+before this one in the file list.
+
+``orig_path`` is set to 0 in case the path element is an exact copy of that
+found in the metadata. In case the path in the original metadata was
+incorrectly encoded, and had to be fixed in order to be acceptable utf-8,
+the original string is preserved in ``orig_path``. The reason to keep it
+is to be able to reproduce the info-section exactly, with the correct
+info-hash.

 ::

@ -863,6 +870,7 @@ before this one in the file list this one in the file list..
 		boost::filesystem::path path;
 		size_type offset;
 		size_type size;
+		boost::shared_ptr<boost::filesystem::path> orig_path;
 	};


--- a/docs/ubuntu_build_notes.html
+++ b/docs/ubuntu_build_notes.html
@ -47,13 +47,13 @@ cvs -d:pserver:anonymous&#64;boost.cvs.sourceforge.net:/cvsroot/boost login
 cvs -z3 -d:pserver:anonymous&#64;boost.cvs.sourceforge.net:/cvsroot/boost checkout boost
 cvs -d:pserver:anonymous&#64;boost.cvs.sourceforge.net:/cvsroot/boost logout

-cvs -d:pserver:anonym...&#64;libtorrent.cvs.sourceforge.net:/cvsroot/libtorrent login
-cvs -z3 -d:pserver:anonym...&#64;libtorrent.cvs.sourceforge.net:/cvsroot/libtorrent co -P libtorrent
-cvs -d:pserver:anonym...&#64;libtorrent.cvs.sourceforge.net:/cvsroot/libtorrent logout
+cvs -d:pserver:anonymous&#64;libtorrent.cvs.sourceforge.net:/cvsroot/libtorrent login
+cvs -z3 -d:pserver:anonymous&#64;libtorrent.cvs.sourceforge.net:/cvsroot/libtorrent co -P libtorrent
+cvs -d:pserver:anonymous&#64;libtorrent.cvs.sourceforge.net:/cvsroot/libtorrent logout

-cvs -d:pserver:anonym...&#64;asio.cvs.sourceforge.net:/cvsroot/asio login
-cvs -z3 -d:pserver:anonym...&#64;asio.cvs.sourceforge.net:/cvsroot/asio co -P asio
-cvs -d:pserver:anonym...&#64;asio.cvs.sourceforge.net:/cvsroot/asio login
+cvs -d:pserver:anonymous&#64;asio.cvs.sourceforge.net:/cvsroot/asio login
+cvs -z3 -d:pserver:anonymous&#64;asio.cvs.sourceforge.net:/cvsroot/asio co -P asio
+cvs -d:pserver:anonymous&#64;asio.cvs.sourceforge.net:/cvsroot/asio logout
 </pre>
 </div>
 <div class="section" id="step-2-building-boost">
--- a/docs/ubuntu_build_notes.rst
+++ b/docs/ubuntu_build_notes.rst
@ -30,13 +30,13 @@ by executing the following commands::
   cvs -z3 -d:pserver:anonymous@boost.cvs.sourceforge.net:/cvsroot/boost checkout boost
   cvs -d:pserver:anonymous@boost.cvs.sourceforge.net:/cvsroot/boost logout

-   cvs -d:pserver:anonym...@libtorrent.cvs.sourceforge.net:/cvsroot/libtorrent login
-   cvs -z3 -d:pserver:anonym...@libtorrent.cvs.sourceforge.net:/cvsroot/libtorrent co -P libtorrent
-   cvs -d:pserver:anonym...@libtorrent.cvs.sourceforge.net:/cvsroot/libtorrent logout
+   cvs -d:pserver:anonymous@libtorrent.cvs.sourceforge.net:/cvsroot/libtorrent login
+   cvs -z3 -d:pserver:anonymous@libtorrent.cvs.sourceforge.net:/cvsroot/libtorrent co -P libtorrent
+   cvs -d:pserver:anonymous@libtorrent.cvs.sourceforge.net:/cvsroot/libtorrent logout

-   cvs -d:pserver:anonym...@asio.cvs.sourceforge.net:/cvsroot/asio login
-   cvs -z3 -d:pserver:anonym...@asio.cvs.sourceforge.net:/cvsroot/asio co -P asio
-   cvs -d:pserver:anonym...@asio.cvs.sourceforge.net:/cvsroot/asio login
+   cvs -d:pserver:anonymous@asio.cvs.sourceforge.net:/cvsroot/asio login
+   cvs -z3 -d:pserver:anonymous@asio.cvs.sourceforge.net:/cvsroot/asio co -P asio
+   cvs -d:pserver:anonymous@asio.cvs.sourceforge.net:/cvsroot/asio logout

 Step 2: Building boost
 ======================
--- a/include/libtorrent/torrent_info.hpp
+++ b/include/libtorrent/torrent_info.hpp
@ -45,6 +45,7 @@ POSSIBILITY OF SUCH DAMAGE.
 #include <boost/date_time/gregorian/gregorian_types.hpp>
 #include <boost/optional.hpp>
 #include <boost/filesystem/path.hpp>
+#include <boost/shared_ptr.hpp>

 #ifdef _MSC_VER
 #pragma warning(pop)
@ -65,6 +66,11 @@ namespace libtorrent
 		boost::filesystem::path path;
 		size_type offset; // the offset of this file inside the torrent
 		size_type size; // the size of this file
+		// if the path was incorrectly encoded, this is
+		// the original corrupt encoded string. It is
+		// preserved in order to be able to reproduce
+		// the correct info-hash
+		boost::shared_ptr<boost::filesystem::path> orig_path;
 	};

 	struct TORRENT_EXPORT file_slice
--- a/src/torrent_info.cpp
+++ b/src/torrent_info.cpp
@ -62,6 +62,98 @@ using namespace boost::filesystem;

 namespace
 {
+	void convert_to_utf8(std::string& str, unsigned char chr) // append chr to str as a 2-byte UTF-8 sequence
+	{
+		str.push_back(static_cast<char>(0xc0 | (chr >> 6))); // lead byte: 110000xx (top two bits of chr)
+		str.push_back(static_cast<char>(0x80 | (chr & 0x3f))); // continuation byte: 10xxxxxx (low six bits)
+	}
+
+	void verify_encoding(file_entry& target) // rewrites target.path as UTF-8 if it is not already valid UTF-8
+	{
+		std::string tmp_path; // receives the UTF-8 form of the path, built byte by byte
+		std::string file_path = target.path.string();
+		bool valid_encoding = true;
+		for (std::string::iterator i = file_path.begin()
+			, end(file_path.end()); i != end; ++i)
+		{
+			// valid ascii-character
+			if ((*i & 0x80) == 0)
+			{
+				tmp_path += *i;
+				continue;
+			}
+
+			if (std::distance(i, end) < 2) // no room for a continuation byte
+			{
+				convert_to_utf8(tmp_path, *i);
+				valid_encoding = false;
+				continue;
+			}
+
+			// valid 2-byte utf-8 character
+			if ((i[0] & 0xe0) == 0xc0
+				&& (i[1] & 0xc0) == 0x80)
+			{
+				tmp_path += i[0];
+				tmp_path += i[1];
+				i += 1; // the loop's ++i skips the remaining byte
+				continue;
+			}
+
+			if (std::distance(i, end) < 3) // too short for a 3-byte sequence
+			{
+				convert_to_utf8(tmp_path, *i);
+				valid_encoding = false;
+				continue;
+			}
+
+			// valid 3-byte utf-8 character
+			if ((i[0] & 0xf0) == 0xe0
+				&& (i[1] & 0xc0) == 0x80
+				&& (i[2] & 0xc0) == 0x80)
+			{
+				tmp_path += i[0];
+				tmp_path += i[1];
+				tmp_path += i[2];
+				i += 2;
+				continue;
+			}
+
+			if (std::distance(i, end) < 4) // too short for a 4-byte sequence
+			{
+				convert_to_utf8(tmp_path, *i);
+				valid_encoding = false;
+				continue;
+			}
+
+			// valid 4-byte utf-8 character
+			if ((i[0] & 0xf8) == 0xf0 // lead byte 11110xxx; the 3-byte mask used before made this branch unreachable
+				&& (i[1] & 0xc0) == 0x80
+				&& (i[2] & 0xc0) == 0x80
+				&& (i[3] & 0xc0) == 0x80)
+			{
+				tmp_path += i[0];
+				tmp_path += i[1];
+				tmp_path += i[2];
+				tmp_path += i[3];
+				i += 3;
+				continue;
+			}
+
+			convert_to_utf8(tmp_path, *i); // not a recognised sequence: re-encode this single byte
+			valid_encoding = false;
+		}
+		// the encoding was not valid utf-8
+		// save the original encoding and replace the
+		// commonly used path with the correctly
+		// encoded string
+		if (!valid_encoding)
+		{
+			target.orig_path.reset(new path(target.path));
+			target.path = tmp_path;
+		}
+	}
+
 	void extract_single_file(const entry& dict, file_entry& target
 		, std::string const& root_dir)
 	{
@ -89,6 +181,7 @@ namespace
 			if (i->string() != "..")
 				target.path /= i->string();
 		}
+		verify_encoding(target);
 		if (target.path.is_complete()) throw std::runtime_error("torrent contains "
 			"a file with an absolute path: '"
 			+ target.path.native_file_string() + "'");
@ -501,7 +594,7 @@ namespace libtorrent
 				files = entry(entry::list_t);

 				for (std::vector<file_entry>::const_iterator i = m_files.begin();
-						i != m_files.end(); ++i)
+					i != m_files.end(); ++i)
 				{
 					files.list().push_back(entry(entry::dictionary_t));
 					entry& file_e = files.list().back();
@ -509,12 +602,14 @@ namespace libtorrent
 					entry& path_e = file_e["path"];
 					path_e = entry(entry::list_t);

-					fs::path const& file_path(i->path);
-					assert(file_path.has_branch_path());
-					assert(*file_path.begin() == m_name);
+					fs::path const* file_path;
+					if (i->orig_path) file_path = &(*i->orig_path);
+					else file_path = &i->path;
+					assert(file_path->has_branch_path());
+					assert(*file_path->begin() == m_name);

-					for (fs::path::iterator j = boost::next(file_path.begin());
-						j != file_path.end(); ++j)
+					for (fs::path::iterator j = boost::next(file_path->begin());
+						j != file_path->end(); ++j)
 					{
 						path_e.list().push_back(entry(*j));
 					}