merge RC_1_1 into master

2017-05-15 09:08:57 -04:00 · 2017-05-15 09:08:57 -04:00 · 73a3e390b5
parent 17bb00c042 56d5d795bf
commit 73a3e390b5
9 changed files with 124 additions and 113 deletions
--- a/2
+++ b/2
@ -73,6 +73,8 @@
 	* require C++11 to build libtorrent


+	* improve path sanitization (filter unicode text direction characters)
+	* deprecate partial_piece_info::piece_state
 	* bind upnp requests to correct local address
 	* save resume data when removing web seeds
 	* fix proxying of https connections
--- a/include/libtorrent/ConvertUTF.h
+++ b/include/libtorrent/ConvertUTF.h
@ -134,6 +134,11 @@ TORRENT_EXTRA_EXPORT ConversionResult ConvertUTF32toUTF16 (
 TORRENT_EXTRA_EXPORT Boolean isLegalUTF8Sequence(const UTF8 *source,
 		const UTF8 *sourceEnd);

+TORRENT_EXTRA_EXPORT Boolean isLegalUTF8(const UTF8 *source, int length);
+
+extern const char trailingBytesForUTF8[256];
+extern const UTF32 offsetsFromUTF8[6];
+
 #ifdef __cplusplus
 }
 #endif
--- a/include/libtorrent/torrent_handle.hpp
+++ b/include/libtorrent/torrent_handle.hpp
@ -177,6 +177,7 @@ namespace libtorrent { namespace aux {
 		//	get_download_queue() is called, it will be invalidated.
 		block_info* blocks;

+#ifndef TORRENT_NO_DEPRECATE
 		// the speed classes. These may be used by the piece picker to
 		// coalesce requests of similar download rates
 		enum state_t { none, slow, medium, fast };
@ -193,7 +194,12 @@ namespace libtorrent { namespace aux {
 		// downloaded pieces down. Pieces set to ``none`` can be converted into
 		// any of ``fast``, ``medium`` or ``slow`` as soon as a peer want to
 		// download from it.
-		state_t piece_state;
+		state_t TORRENT_DEPRECATED_MEMBER piece_state;
+#else
+		// hidden
+		enum deprecated_state_t { none, slow, medium, fast };
+		deprecated_state_t deprecated_piece_state;
+#endif
 	};

 	// for std::hash (and to support using this type in unordered_map etc.)
--- a/include/libtorrent/utf8.hpp
+++ b/include/libtorrent/utf8.hpp
@ -39,6 +39,7 @@ POSSIBILITY OF SUCH DAMAGE.
 // convert_to_native and convert_from_native
 #if TORRENT_USE_WSTRING || defined TORRENT_WINDOWS

+#include <cstdint>
 #include <string>
 #include <cwchar>

@ -78,6 +79,10 @@ namespace libtorrent {
 	TORRENT_EXTRA_EXPORT std::wstring utf8_wchar(string_view utf8);
 	TORRENT_EXTRA_EXPORT std::string wchar_utf8(wstring_view wide, error_code& ec);
 	TORRENT_EXTRA_EXPORT std::string wchar_utf8(wstring_view wide);
+
+	// TODO: 3 take a string_view here
+	TORRENT_EXTRA_EXPORT std::pair<std::int32_t, int>
+		parse_utf8_codepoint(char const* str, int len);
 }
 #endif // !BOOST_NO_STD_WSTRING

--- a/src/ConvertUTF.cpp
+++ b/src/ConvertUTF.cpp
@ -171,7 +171,7 @@ if (result == sourceIllegal) {
 * left as-is for anyone who may want to do such conversion, which was
 * allowed in earlier algorithms.
 */
-static const char trailingBytesForUTF8[256] = {
+const char trailingBytesForUTF8[256] = {
 	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
 	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
 	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
@ -187,7 +187,7 @@ static const char trailingBytesForUTF8[256] = {
 * This table contains as many values as there might be trailing bytes
 * in a UTF-8 sequence.
 */
-static const UTF32 offsetsFromUTF8[6] = { 0x00000000UL, 0x00003080UL, 0x000E2080UL,
+const UTF32 offsetsFromUTF8[6] = { 0x00000000UL, 0x00003080UL, 0x000E2080UL,
 	0x03C82080UL, 0xFA082080UL, 0x82082080UL };

 /*
@ -292,7 +292,7 @@ ConversionResult ConvertUTF16toUTF8 (
 * definition of UTF-8 goes up to 4-byte sequences.
 */

-static Boolean isLegalUTF8(const UTF8 *source, int length) {
+Boolean isLegalUTF8(const UTF8 *source, int length) {
 	UTF8 a;
 	const UTF8 *srcptr = source+length;
 	switch (length) {
--- a/src/torrent.cpp
+++ b/src/torrent.cpp
@ -6283,6 +6283,11 @@ namespace libtorrent {
 			pi.finished = int(i->finished);
 			pi.writing = int(i->writing);
 			pi.requested = int(i->requested);
+#ifndef TORRENT_NO_DEPRECATE
+			pi.piece_state = partial_piece_info::none;
+#else
+			pi.deprecated_piece_state = partial_piece_info::none;
+#endif
 			TORRENT_ASSERT(counter * blocks_per_piece + pi.blocks_in_piece <= int(blk.size()));
 			pi.blocks = &blk[std::size_t(counter * blocks_per_piece)];
 			int const piece_size = torrent_file().piece_size(i->index);
--- a/src/torrent_info.cpp
+++ b/src/torrent_info.cpp
@ -60,24 +60,44 @@ POSSIBILITY OF SUCH DAMAGE.
 #endif

 #include <unordered_set>
+#include <cstdint>
 #include <iterator>
 #include <algorithm>
 #include <set>
 #include <ctime>
+#include <array>

 namespace libtorrent {

 	namespace {

-	bool valid_path_character(char c)
+	bool valid_path_character(std::int32_t const c)
 	{
 #ifdef TORRENT_WINDOWS
 		static const char invalid_chars[] = "?<>\"|\b*:";
 #else
 		static const char invalid_chars[] = "";
 #endif
-		if (c >= 0 && c < 32) return false;
-		return std::strchr(invalid_chars, c) == nullptr;
+		if (c < 32) return false; // control characters, and any negative value
+		if (c > 127) return true; // non-ASCII code points are always accepted
+		return std::strchr(invalid_chars, static_cast<char>(c)) == nullptr; // c is in [32, 127] here, so the cast is lossless
+	}
+
+	bool filter_path_character(std::int32_t const c)
+	{
+		// returns true if the code point ``c`` must be dropped from a path element.
+		// unicode characters that change the writing direction are dropped since they can be used for attacks:
+		// https://security.stackexchange.com/questions/158802/how-can-this-executable-have-an-avi-extension
+		static const std::array<std::int32_t, 7> bad_cp = {{0x202a, 0x202b, 0x202c, 0x202d, 0x202e, 0x200e, 0x200f}};
+		if (std::find(bad_cp.begin(), bad_cp.end(), c) != bad_cp.end()) return true;
+
+#ifdef TORRENT_WINDOWS
+		static const char invalid_chars[] = "/\\:";
+#else
+		static const char invalid_chars[] = "/\\";
+#endif
+		if (c > 127) return false; // no other non-ASCII code point is dropped; keeps the cast below lossless
+		return std::strchr(invalid_chars, static_cast<char>(c)) != nullptr;
+	}

 	} // anonymous namespace
@ -209,118 +229,36 @@ namespace libtorrent {
 		// the number of dots we've added
 		char num_dots = 0;
 		bool found_extension = false;
-		for (std::size_t i = 0; i < element.size(); ++i)
+
+		int seq_len = 0;
+		for (std::size_t i = 0; i < element.size(); i += std::size_t(seq_len))
+		{
+			std::int32_t code_point;
+			std::tie(code_point, seq_len) = parse_utf8_codepoint(element.data() + i
+				, int(element.size() - i));
+
+			if (code_point >= 0 && filter_path_character(code_point))
 			{
-			if (element[i] == '/'
-				|| element[i] == '\\'
-#ifdef TORRENT_WINDOWS
-				|| element[i] == ':'
-#endif
-				)
 				continue;
-
-			if (element[i] == '.') ++num_dots;
-
-			int last_len = 0;
-
-			if ((element[i] & 0x80) == 0)
-			{
-				// 1 byte
-				if (valid_path_character(element[i]))
-				{
-					path += element[i];
-				}
-				else
-				{
-					path += '_';
-				}
-				last_len = 1;
-			}
-			else if ((element[i] & 0xe0) == 0xc0)
-			{
-				// 2 bytes
-				if (element.size() - i < 2
-					|| (element[i + 1] & 0xc0) != 0x80)
-				{
-					path += '_';
-					last_len = 1;
-				}
-				else if ((element[i] & 0x1f) == 0)
-				{
-					// overlong sequences are invalid
-					path += '_';
-					last_len = 1;
-				}
-				else
-				{
-					path += element[i];
-					path += element[i + 1];
-					last_len = 2;
-				}
-				i += 1;
-			}
-			else if ((element[i] & 0xf0) == 0xe0)
-			{
-				// 3 bytes
-				if (element.size() - i < 3
-					|| (element[i + 1] & 0xc0) != 0x80
-					|| (element[i + 2] & 0xc0) != 0x80
-					)
-				{
-					path += '_';
-					last_len = 1;
-				}
-				else if ((element[i] & 0x0f) == 0)
-				{
-					// overlong sequences are invalid
-					path += '_';
-					last_len = 1;
-				}
-				else
-				{
-					path += element[i];
-					path += element[i + 1];
-					path += element[i + 2];
-					last_len = 3;
-				}
-				i += 2;
-			}
-			else if ((element[i] & 0xf8) == 0xf0)
-			{
-				// 4 bytes
-				if (element.size() - i < 4
-					|| (element[i + 1] & 0xc0) != 0x80
-					|| (element[i + 2] & 0xc0) != 0x80
-					|| (element[i + 3] & 0xc0) != 0x80
-					)
-				{
-					path += '_';
-					last_len = 1;
-				}
-				else if ((element[i] & 0x07) == 0
-					&& (element[i + 1] & 0x3f) == 0)
-				{
-					// overlong sequences are invalid
-					path += '_';
-					last_len = 1;
-				}
-				else
-				{
-					path += element[i];
-					path += element[i + 1];
-					path += element[i + 2];
-					path += element[i + 3];
-					last_len = 4;
-				}
-				i += 3;
-			}
-			else
-			{
-				path += '_';
-				last_len = 1;
 			}

-			added += last_len;
+			if (code_point < 0
+				|| !valid_path_character(code_point))
+			{
+				// invalid utf8 sequence, replace with "_"
+				path += '_';
+				++added;
+				++unicode_chars;
+				continue;
+			}
+
+			// validation passed, add it to the output string
+			for (std::size_t k = i; k < i + std::size_t(seq_len); ++k)
+				path.push_back(element[k]);
+
+			if (code_point == '.') ++num_dots;
+
+			added += seq_len;
 			++unicode_chars;

 			// any given path element should not
--- a/src/utf8.cpp
+++ b/src/utf8.cpp
@ -268,6 +268,39 @@ namespace {
 		if (ec) aux::throw_ex<system_error>(ec);
 		return ret;
 	}
+
+	// parses the utf8 sequence starting at ``str``, reading at most ``len`` bytes. returns the unicode codepoint and the
+	// number of bytes of the sequence (``len`` if the sequence is cut short). The codepoint is -1 if it's invalid
+	std::pair<std::int32_t, int> parse_utf8_codepoint(char const* str, int const len)
+	{
+		int const sequence_len = trailingBytesForUTF8[static_cast<std::uint8_t>(*str)] + 1; // looked up from the first byte
+		if (sequence_len > len) return std::make_pair(-1, len); // sequence would run past the end of the input
+
+		if (sequence_len > 4) // anything longer than 4 bytes is rejected
+		{
+			return std::make_pair(-1, sequence_len);
+		}
+
+		if (!isLegalUTF8(reinterpret_cast<UTF8 const*>(str), sequence_len))
+		{
+			return std::make_pair(-1, sequence_len);
+		}
+
+		std::uint32_t ch = 0;
+		for (int i = 0; i < sequence_len; ++i) // accumulate the raw bytes, 6 bits apart
+		{
+			ch <<= 6;
+			ch += static_cast<std::uint8_t>(str[i]);
+		}
+		ch -= offsetsFromUTF8[sequence_len-1]; // subtract the per-length offset to leave the code point value
+
+		if (ch > 0x7fffffff) // would not fit in a non-negative std::int32_t
+		{
+			return std::make_pair(-1, sequence_len);
+		}
+
+		return std::make_pair(static_cast<std::int32_t>(ch), sequence_len);
+	}
 }

 #ifdef __clang__
--- a/test/test_torrent_info.cpp
+++ b/test/test_torrent_info.cpp
@ -493,7 +493,7 @@ TORRENT_TEST(sanitize_path)
 	// 5-byte utf-8 sequence (not allowed)
 	path.clear();
 	sanitize_append_path_element(path, "filename\xf8\x9f\x9f\x9f\x9f" "foobar");
-	TEST_EQUAL(path, "filename_____foobar");
+	TEST_EQUAL(path, "filename_foobar");

 	// redundant (overlong) 2-byte sequence
 	// ascii code 0x2e encoded with a leading 0
@ -512,6 +512,23 @@ TORRENT_TEST(sanitize_path)
 	path.clear();
 	sanitize_append_path_element(path, "filename\xf0\x80\x80\xae");
 	TEST_EQUAL(path, "filename_");
+
+	// a filename where every character is filtered is not replaced by an underscore
+	path.clear();
+	sanitize_append_path_element(path, "//\\");
+	TEST_EQUAL(path, "");
+
+	// make sure suspicious unicode characters are filtered out
+	path.clear();
+	// that's utf-8 for U+200e LEFT-TO-RIGHT MARK
+	sanitize_append_path_element(path, "foo\xe2\x80\x8e" "bar");
+	TEST_EQUAL(path, "foobar");
+
+	// make sure suspicious unicode characters are filtered out
+	path.clear();
+	// that's utf-8 for U+202b RIGHT-TO-LEFT EMBEDDING
+	sanitize_append_path_element(path, "foo\xe2\x80\xab" "bar");
+	TEST_EQUAL(path, "foobar");
 }

 TORRENT_TEST(verify_encoding)