From 3460f203f3f792fb371c7bc4232ef0c46465881d Mon Sep 17 00:00:00 2001
From: arvidn <arvid@cs.umu.se>
Date: Sat, 19 Dec 2015 18:28:12 -0500
Subject: [PATCH] extend testing of sanitize_append_path_element

---
 include/libtorrent/torrent_info.hpp |   3 +-
 src/torrent_info.cpp                |  76 +++++---
 test/test_torrent_info.cpp          | 283 ++++++++++++++++++++--------
 3 files changed, 249 insertions(+), 113 deletions(-)

diff --git a/include/libtorrent/torrent_info.hpp b/include/libtorrent/torrent_info.hpp
index dc72e1bf1..6f0edb335 100644
--- a/include/libtorrent/torrent_info.hpp
+++ b/include/libtorrent/torrent_info.hpp
@@ -64,8 +64,7 @@ namespace libtorrent
 	// internal, exposed for the unit test
 	TORRENT_EXTRA_EXPORT void sanitize_append_path_element(std::string& path
 		, char const* element, int element_len);
-	TORRENT_EXTRA_EXPORT bool verify_encoding(std::string& target
-		, bool fix_paths = false);
+	TORRENT_EXTRA_EXPORT bool verify_encoding(std::string& target);
 
 	// the web_seed_entry holds information about a web seed (also known
 	// as URL seed or HTTP seed). It is essentially a URL with some state
diff --git a/src/torrent_info.cpp b/src/torrent_info.cpp
index 9ef774274..f58db0e09 100644
--- a/src/torrent_info.cpp
+++ b/src/torrent_info.cpp
@@ -94,11 +94,8 @@ namespace libtorrent
 
 	} // anonymous namespace
 
-	// fixes invalid UTF-8 sequences and
-	// replaces characters that are invalid
-	// in paths
-	TORRENT_EXTRA_EXPORT bool verify_encoding(std::string& target
-		, bool fix_paths)
+	// fixes invalid UTF-8 sequences
+	TORRENT_EXTRA_EXPORT bool verify_encoding(std::string& target)
 	{
 		if (target.empty()) return true;
 
@@ -145,14 +142,6 @@ namespace libtorrent
 				valid_encoding = false;
 			}
 
-			// if fix paths is true, also replace characters that are invalid
-			// in filenames
-			if (fix_paths && codepoint < 0x7f && !valid_path_character(codepoint))
-			{
-				codepoint = '_';
-				valid_encoding = false;
-			}
-
 			// encode codepoint into utf-8
 			cp = &codepoint;
 			UTF8 sequence[5];
@@ -250,7 +239,14 @@ namespace libtorrent
 			if ((element[i] & 0x80) == 0)
 			{
 				// 1 byte
-				path += element[i];
+				if (valid_path_character(element[i]))
+				{
+					path += element[i];
+				}
+				else
+				{
+					path += '_';
+				}
 				last_len = 1;
 			}
 			else if ((element[i] & 0xe0) == 0xc0)
@@ -259,7 +255,13 @@ namespace libtorrent
 				if (element_len - i < 2
 					|| (element[i+1] & 0xc0) != 0x80)
 				{
-					path += '?';
+					path += '_';
+					last_len = 1;
+				}
+				else if ((element[i] & 0x1e) == 0)
+				{
+					// overlong: lead bytes 0xc0 and 0xc1 only encode code points < 0x80
+					path += '_';
 					last_len = 1;
 				}
 				else
@@ -278,7 +280,13 @@ namespace libtorrent
 					|| (element[i+2] & 0xc0) != 0x80
 					)
 				{
-					path += '?';
+					path += '_';
+					last_len = 1;
+				}
+				else if ((element[i] & 0x0f) == 0 && (element[i+1] & 0x20) == 0)
+				{
+					// overlong: 0xe0 followed by 0x80..0x9f encodes a code point < 0x800
+					path += '_';
 					last_len = 1;
 				}
 				else
@@ -299,7 +307,14 @@ namespace libtorrent
 					|| (element[i+3] & 0xc0) != 0x80
 					)
 				{
-					path += '?';
+					path += '_';
+					last_len = 1;
+				}
+				else if ((element[i] & 0x07) == 0
+					&& (element[i+1] & 0x30) == 0)
+				{
+					// overlong: 0xf0 followed by 0x80..0x8f encodes a code point < 0x10000
+					path += '_';
 					last_len = 1;
 				}
 				else
@@ -312,6 +327,11 @@ namespace libtorrent
 				}
 				i += 3;
 			}
+			else
+			{
+				path += '_';
+				last_len = 1;
+			}
 
 			added += last_len;
 			++unicode_chars;
@@ -347,18 +367,20 @@ namespace libtorrent
 			return;
 		}
 
-		if (added == 0 && added_separator)
-		{
-			// remove the separator added at the beginning
-			path.erase(path.end()-1);
-			return;
-		}
-
 		// remove trailing spaces and dots. These aren't allowed in filenames on windows
 		for (int i = path.size() - 1; i >= 0; --i)
 		{
 			if (path[i] != ' ' && path[i] != '.') break;
 			path.resize(i);
+			--added;
+			TORRENT_ASSERT(added >= 0);
+		}
+
+		if (added == 0 && added_separator)
+		{
+			// remove the separator added at the beginning
+			path.erase(path.end()-1);
+			return;
 		}
 
 		if (path.empty()) path = "_";
@@ -405,8 +427,6 @@ namespace libtorrent
 			filename = p.string_ptr() + info_ptr_diff;
 			filename_len = p.string_length();
 			sanitize_append_path_element(path, p.string_ptr(), p.string_length());
-
-//			if (path.empty()) path = to_hex(files.info_hash().to_string());
 		}
 		else
 		{
@@ -451,7 +471,7 @@ namespace libtorrent
 		bdecode_node attr = dict.dict_find_string("attr");
 		if (attr)
 		{
-			for (int i = 0; i < attr.string_length(); ++i)	
+			for (int i = 0; i < attr.string_length(); ++i)
 			{
 				switch (attr.string_ptr()[i])
 				{
@@ -1604,7 +1624,7 @@ namespace libtorrent
 		m_comment = torrent_file.dict_find_string_value("comment.utf-8");
 		if (m_comment.empty()) m_comment = torrent_file.dict_find_string_value("comment");
 		verify_encoding(m_comment);
-	
+
 		m_created_by = torrent_file.dict_find_string_value("created by.utf-8");
 		if (m_created_by.empty()) m_created_by = torrent_file.dict_find_string_value("created by");
 		verify_encoding(m_created_by);
diff --git a/test/test_torrent_info.cpp b/test/test_torrent_info.cpp
index b6d9e0ff0..f24c0035d 100644
--- a/test/test_torrent_info.cpp
+++ b/test/test_torrent_info.cpp
@@ -161,7 +161,6 @@ test_failing_torrent_t test_error_torrents[] =
 // TODO: torrent with 'l' (symlink) attribute
 // TODO: creating a merkle torrent (torrent_info::build_merkle_list)
 // TODO: torrent with multiple trackers in multiple tiers, making sure we shuffle them (how do you test shuffling?, load it multiple times and make sure it's in different order at least once)
-// TODO: sanitize_append_path_element with all kinds of UTF-8 sequences, including invalid ones
 // TODO: torrents with a missing name
 // TODO: torrents with a zero-length name
 // TODO: torrents with a merkle tree and add_merkle_nodes
@@ -285,6 +284,11 @@ TORRENT_TEST(sanitize_path)
 	TEST_EQUAL(path, "a/c");
 #endif
 
+	path.clear();
+	sanitize_append_path_element(path, "a", 1);
+	sanitize_append_path_element(path, "..", 2);
+	TEST_EQUAL(path, "a");
+
 	path.clear();
 	sanitize_append_path_element(path, "/..", 3);
 	sanitize_append_path_element(path, ".", 1);
@@ -328,6 +332,201 @@ TORRENT_TEST(sanitize_path)
 	TEST_EQUAL(path, "c/c");
 #endif
 
+	path.clear();
+	sanitize_append_path_element(path, "\b", 1);
+	TEST_EQUAL(path, "_");
+
+	path.clear();
+	sanitize_append_path_element(path, "\b", 1);
+	sanitize_append_path_element(path, "filename", 8);
+#ifdef TORRENT_WINDOWS
+	TEST_EQUAL(path, "_\\filename");
+#else
+	TEST_EQUAL(path, "_/filename");
+#endif
+
+	path.clear();
+	sanitize_append_path_element(path, "filename", 8);
+	sanitize_append_path_element(path, "\b", 1);
+#ifdef TORRENT_WINDOWS
+	TEST_EQUAL(path, "filename\\_");
+#else
+	TEST_EQUAL(path, "filename/_");
+#endif
+
+	path.clear();
+	sanitize_append_path_element(path, "abc", 3);
+	sanitize_append_path_element(path, "", 0);
+#ifdef TORRENT_WINDOWS
+	TEST_EQUAL(path, "abc\\_");
+#else
+	TEST_EQUAL(path, "abc/_");
+#endif
+
+	path.clear();
+	sanitize_append_path_element(path, "abc", 3);
+	sanitize_append_path_element(path, "   ", 3);
+	TEST_EQUAL(path, "abc");
+
+	path.clear();
+	sanitize_append_path_element(path, "", 0);
+	sanitize_append_path_element(path, "abc", 3);
+#ifdef TORRENT_WINDOWS
+	TEST_EQUAL(path, "_\\abc");
+#else
+	TEST_EQUAL(path, "_/abc");
+#endif
+
+	path.clear();
+	sanitize_append_path_element(path, "\b?filename=4", 12);
+#ifdef TORRENT_WINDOWS
+	TEST_EQUAL(path, "__filename=4");
+#else
+	TEST_EQUAL(path, "_?filename=4");
+#endif
+
+	path.clear();
+	sanitize_append_path_element(path, "filename=4", 10);
+	TEST_EQUAL(path, "filename=4");
+
+	// valid 2-byte sequence
+	path.clear();
+	sanitize_append_path_element(path, "filename\xc2\xa1", 10);
+	TEST_EQUAL(path, "filename\xc2\xa1");
+
+	// truncated 2-byte sequence
+	path.clear();
+	sanitize_append_path_element(path, "filename\xc2", 9);
+	TEST_EQUAL(path, "filename_");
+
+	// valid 3-byte sequence
+	path.clear();
+	sanitize_append_path_element(path, "filename\xe2\x9f\xb9", 11);
+	TEST_EQUAL(path, "filename\xe2\x9f\xb9");
+
+	// truncated 3-byte sequence (last continuation byte missing)
+	path.clear();
+	sanitize_append_path_element(path, "filename\xe2\x9f", 10);
+	TEST_EQUAL(path, "filename_");
+
+	// truncated 3-byte sequence (only the lead byte present)
+	path.clear();
+	sanitize_append_path_element(path, "filename\xe2", 9);
+	TEST_EQUAL(path, "filename_");
+
+	// valid 4-byte sequence
+	path.clear();
+	sanitize_append_path_element(path, "filename\xf0\x9f\x92\x88", 12);
+	TEST_EQUAL(path, "filename\xf0\x9f\x92\x88");
+
+	// truncated 4-byte sequence
+	path.clear();
+	sanitize_append_path_element(path, "filename\xf0\x9f\x92", 11);
+	TEST_EQUAL(path, "filename_");
+
+	// 5-byte utf-8 sequence (not allowed)
+	path.clear();
+	sanitize_append_path_element(path, "filename\xf8\x9f\x9f\x9f\x9f" "foobar", 19);
+	TEST_EQUAL(path, "filename_____foobar");
+
+	// redundant (overlong) 2-byte sequence
+	// ascii code 0x2e encoded with a leading 0
+	path.clear();
+	sanitize_append_path_element(path, "filename\xc0\xae", 10);
+	TEST_EQUAL(path, "filename_");
+
+	// redundant (overlong) 3-byte sequence
+	// ascii code 0x2e encoded with two leading 0s
+	path.clear();
+	sanitize_append_path_element(path, "filename\xe0\x80\xae", 11);
+	TEST_EQUAL(path, "filename_");
+
+	// redundant (overlong) 4-byte sequence
+	// ascii code 0x2e encoded with three leading 0s
+	path.clear();
+	sanitize_append_path_element(path, "filename\xf0\x80\x80\xae", 12);
+	TEST_EQUAL(path, "filename_");
+}
+
+TORRENT_TEST(verify_encoding)
+{
+	// plain ASCII is valid UTF-8: path-unsafe characters are left untouched
+	std::string test = "\b?filename=4";
+	TEST_CHECK(verify_encoding(test));
+	TEST_CHECK(test == "\b?filename=4");
+
+	test = "filename=4";
+	TEST_CHECK(verify_encoding(test));
+	TEST_CHECK(test == "filename=4");
+
+	// valid 2-byte sequence
+	test = "filename\xc2\xa1";
+	TEST_CHECK(verify_encoding(test));
+	fprintf(stderr, "%s\n", test.c_str());
+	TEST_CHECK(test == "filename\xc2\xa1");
+
+	// truncated 2-byte sequence
+	test = "filename\xc2";
+	TEST_CHECK(!verify_encoding(test));
+	fprintf(stderr, "%s\n", test.c_str());
+	TEST_CHECK(test == "filename_");
+
+	// valid 3-byte sequence
+	test = "filename\xe2\x9f\xb9";
+	TEST_CHECK(verify_encoding(test));
+	fprintf(stderr, "%s\n", test.c_str());
+	TEST_CHECK(test == "filename\xe2\x9f\xb9");
+
+	// truncated 3-byte sequence (last continuation byte missing)
+	test = "filename\xe2\x9f";
+	TEST_CHECK(!verify_encoding(test));
+	fprintf(stderr, "%s\n", test.c_str());
+	TEST_CHECK(test == "filename_");
+
+	// truncated 3-byte sequence (only the lead byte present)
+	test = "filename\xe2";
+	TEST_CHECK(!verify_encoding(test));
+	fprintf(stderr, "%s\n", test.c_str());
+	TEST_CHECK(test == "filename_");
+
+	// valid 4-byte sequence
+	test = "filename\xf0\x9f\x92\x88";
+	TEST_CHECK(verify_encoding(test));
+	fprintf(stderr, "%s\n", test.c_str());
+	TEST_CHECK(test == "filename\xf0\x9f\x92\x88");
+
+	// truncated 4-byte sequence (last continuation byte missing)
+	test = "filename\xf0\x9f\x92";
+	TEST_CHECK(!verify_encoding(test));
+	fprintf(stderr, "%s\n", test.c_str());
+	TEST_CHECK(test == "filename_");
+
+	// 5-byte utf-8 sequence (not allowed): one '_' is expected per byte
+	test = "filename\xf8\x9f\x9f\x9f\x9f""foobar";
+	TEST_CHECK(!verify_encoding(test));
+	fprintf(stderr, "%s\n", test.c_str());
+	TEST_CHECK(test == "filename_____foobar");
+
+	// redundant (overlong) 2-byte sequence
+	// ascii code 0x2e encoded with a leading 0
+	test = "filename\xc0\xae";
+	TEST_CHECK(!verify_encoding(test));
+	fprintf(stderr, "%s\n", test.c_str());
+	TEST_CHECK(test == "filename__");
+
+	// redundant (overlong) 3-byte sequence
+	// ascii code 0x2e encoded with two leading 0s
+	test = "filename\xe0\x80\xae";
+	TEST_CHECK(!verify_encoding(test));
+	fprintf(stderr, "%s\n", test.c_str());
+	TEST_CHECK(test == "filename___");
+
+	// redundant (overlong) 4-byte sequence
+	// ascii code 0x2e encoded with three leading 0s
+	test = "filename\xf0\x80\x80\xae";
+	TEST_CHECK(!verify_encoding(test));
+	fprintf(stderr, "%s\n", test.c_str());
+	TEST_CHECK(test == "filename____");
 }
 
 TORRENT_TEST(parse)
@@ -375,88 +574,6 @@ TORRENT_TEST(parse)
 	std::cerr << ti3.name() << std::endl;
 	TEST_EQUAL(ti3.name(), "test2..test3.......test4");
 
-	// verify_encoding
-	std::string test = "\b?filename=4";
-	TEST_CHECK(!verify_encoding(test, true));
-#ifdef TORRENT_WINDOWS
-	TEST_CHECK(test == "__filename=4");
-#else
-	TEST_CHECK(test == "_?filename=4");
-#endif
-
-	test = "filename=4";
-	TEST_CHECK(verify_encoding(test, true));
-	TEST_CHECK(test == "filename=4");
-
-	// valid 2-byte sequence
-	test = "filename\xc2\xa1";
-	TEST_CHECK(verify_encoding(test, true));
-	fprintf(stderr, "%s\n", test.c_str());
-	TEST_CHECK(test == "filename\xc2\xa1");
-
-	// truncated 2-byte sequence
-	test = "filename\xc2";
-	TEST_CHECK(!verify_encoding(test, true));
-	fprintf(stderr, "%s\n", test.c_str());
-	TEST_CHECK(test == "filename_");
-
-	// valid 3-byte sequence
-	test = "filename\xe2\x9f\xb9";
-	TEST_CHECK(verify_encoding(test, true));
-	fprintf(stderr, "%s\n", test.c_str());
-	TEST_CHECK(test == "filename\xe2\x9f\xb9");
-
-	// truncated 3-byte sequence
-	test = "filename\xe2\x9f";
-	TEST_CHECK(!verify_encoding(test, true));
-	fprintf(stderr, "%s\n", test.c_str());
-	TEST_CHECK(test == "filename_");
-
-	// truncated 3-byte sequence
-	test = "filename\xe2";
-	TEST_CHECK(!verify_encoding(test, true));
-	fprintf(stderr, "%s\n", test.c_str());
-	TEST_CHECK(test == "filename_");
-
-	// valid 4-byte sequence
-	test = "filename\xf0\x9f\x92\x88";
-	TEST_CHECK(verify_encoding(test, true));
-	fprintf(stderr, "%s\n", test.c_str());
-	TEST_CHECK(test == "filename\xf0\x9f\x92\x88");
-
-	// truncated 4-byte sequence
-	test = "filename\xf0\x9f\x92";
-	TEST_CHECK(!verify_encoding(test, true));
-	fprintf(stderr, "%s\n", test.c_str());
-	TEST_CHECK(test == "filename_");
-
-	// 5-byte utf-8 sequence (not allowed)
-	test = "filename\xf8\x9f\x9f\x9f\x9f""foobar";
-	TEST_CHECK(!verify_encoding(test, true));
-	fprintf(stderr, "%s\n", test.c_str());
-	TEST_CHECK(test == "filename_____foobar");
-
-	// redundant (overlong) 2-byte sequence
-	// ascii code 0x2e encoded with a leading 0
-	test = "filename\xc0\xae";
-	TEST_CHECK(!verify_encoding(test, true));
-	fprintf(stderr, "%s\n", test.c_str());
-	TEST_CHECK(test == "filename__");
-
-	// redundant (overlong) 3-byte sequence
-	// ascii code 0x2e encoded with two leading 0s
-	test = "filename\xe0\x80\xae";
-	TEST_CHECK(!verify_encoding(test, true));
-	fprintf(stderr, "%s\n", test.c_str());
-	TEST_CHECK(test == "filename___");
-
-	// redundant (overlong) 4-byte sequence
-	// ascii code 0x2e encoded with three leading 0s
-	test = "filename\xf0\x80\x80\xae";
-	TEST_CHECK(!verify_encoding(test, true));
-	fprintf(stderr, "%s\n", test.c_str());
-	TEST_CHECK(test == "filename____");
-
 	std::string root_dir = parent_path(current_working_directory());
 	for (int i = 0; i < int(sizeof(test_torrents)/sizeof(test_torrents[0])); ++i)
 	{