extend testing of sanitize_append_path_element

This commit is contained in:
arvidn 2015-12-19 18:28:12 -05:00
parent 5c44bf1b36
commit 3460f203f3
3 changed files with 249 additions and 113 deletions

View File

@ -64,8 +64,7 @@ namespace libtorrent
// internal, exposed for the unit test
TORRENT_EXTRA_EXPORT void sanitize_append_path_element(std::string& path
, char const* element, int element_len);
TORRENT_EXTRA_EXPORT bool verify_encoding(std::string& target
, bool fix_paths = false);
TORRENT_EXTRA_EXPORT bool verify_encoding(std::string& target);
// the web_seed_entry holds information about a web seed (also known
// as URL seed or HTTP seed). It is essentially a URL with some state

View File

@ -94,11 +94,8 @@ namespace libtorrent
} // anonymous namespace
// fixes invalid UTF-8 sequences and
// replaces characters that are invalid
// in paths
TORRENT_EXTRA_EXPORT bool verify_encoding(std::string& target
, bool fix_paths)
// fixes invalid UTF-8 sequences
TORRENT_EXTRA_EXPORT bool verify_encoding(std::string& target)
{
if (target.empty()) return true;
@ -145,14 +142,6 @@ namespace libtorrent
valid_encoding = false;
}
// if fix paths is true, also replace characters that are invalid
// in filenames
if (fix_paths && codepoint < 0x7f && !valid_path_character(codepoint))
{
codepoint = '_';
valid_encoding = false;
}
// encode codepoint into utf-8
cp = &codepoint;
UTF8 sequence[5];
@ -250,7 +239,14 @@ namespace libtorrent
if ((element[i] & 0x80) == 0)
{
// 1 byte
path += element[i];
if (valid_path_character(element[i]))
{
path += element[i];
}
else
{
path += '_';
}
last_len = 1;
}
else if ((element[i] & 0xe0) == 0xc0)
@ -259,7 +255,13 @@ namespace libtorrent
if (element_len - i < 2
|| (element[i+1] & 0xc0) != 0x80)
{
path += '?';
path += '_';
last_len = 1;
}
else if ((element[i] & 0x1f) == 0)
{
// overlong sequences are invalid
path += '_';
last_len = 1;
}
else
@ -278,7 +280,13 @@ namespace libtorrent
|| (element[i+2] & 0xc0) != 0x80
)
{
path += '?';
path += '_';
last_len = 1;
}
else if ((element[i] & 0x0f) == 0)
{
// overlong sequences are invalid
path += '_';
last_len = 1;
}
else
@ -299,7 +307,14 @@ namespace libtorrent
|| (element[i+3] & 0xc0) != 0x80
)
{
path += '?';
path += '_';
last_len = 1;
}
else if ((element[i] & 0x07) == 0
&& (element[i+1] & 0x3f) == 0)
{
// overlong sequences are invalid
path += '_';
last_len = 1;
}
else
@ -312,6 +327,11 @@ namespace libtorrent
}
i += 3;
}
else
{
path += '_';
last_len = 1;
}
added += last_len;
++unicode_chars;
@ -347,18 +367,20 @@ namespace libtorrent
return;
}
if (added == 0 && added_separator)
{
// remove the separator added at the beginning
path.erase(path.end()-1);
return;
}
// remove trailing spaces and dots. These aren't allowed in filenames on windows
for (int i = path.size() - 1; i >= 0; --i)
{
if (path[i] != ' ' && path[i] != '.') break;
path.resize(i);
--added;
TORRENT_ASSERT(added >= 0);
}
if (added == 0 && added_separator)
{
// remove the separator added at the beginning
path.erase(path.end()-1);
return;
}
if (path.empty()) path = "_";
@ -405,8 +427,6 @@ namespace libtorrent
filename = p.string_ptr() + info_ptr_diff;
filename_len = p.string_length();
sanitize_append_path_element(path, p.string_ptr(), p.string_length());
// if (path.empty()) path = to_hex(files.info_hash().to_string());
}
else
{
@ -451,7 +471,7 @@ namespace libtorrent
bdecode_node attr = dict.dict_find_string("attr");
if (attr)
{
for (int i = 0; i < attr.string_length(); ++i)
for (int i = 0; i < attr.string_length(); ++i)
{
switch (attr.string_ptr()[i])
{
@ -1604,7 +1624,7 @@ namespace libtorrent
m_comment = torrent_file.dict_find_string_value("comment.utf-8");
if (m_comment.empty()) m_comment = torrent_file.dict_find_string_value("comment");
verify_encoding(m_comment);
m_created_by = torrent_file.dict_find_string_value("created by.utf-8");
if (m_created_by.empty()) m_created_by = torrent_file.dict_find_string_value("created by");
verify_encoding(m_created_by);

View File

@ -161,7 +161,6 @@ test_failing_torrent_t test_error_torrents[] =
// TODO: torrent with 'l' (symlink) attribute
// TODO: creating a merkle torrent (torrent_info::build_merkle_list)
// TODO: torrent with multiple trackers in multiple tiers, making sure we shuffle them (how do you test shuffling?, load it multiple times and make sure it's in different order at least once)
// TODO: sanitize_append_path_element with all kinds of UTF-8 sequences, including invalid ones
// TODO: torrents with a missing name
// TODO: torrents with a zero-length name
// TODO: torrents with a merkle tree and add_merkle_nodes
@ -285,6 +284,11 @@ TORRENT_TEST(sanitize_path)
TEST_EQUAL(path, "a/c");
#endif
path.clear();
sanitize_append_path_element(path, "a", 1);
sanitize_append_path_element(path, "..", 2);
TEST_EQUAL(path, "a");
path.clear();
sanitize_append_path_element(path, "/..", 3);
sanitize_append_path_element(path, ".", 1);
@ -328,6 +332,201 @@ TORRENT_TEST(sanitize_path)
TEST_EQUAL(path, "c/c");
#endif
path.clear();
sanitize_append_path_element(path, "\b", 1);
TEST_EQUAL(path, "_");
path.clear();
sanitize_append_path_element(path, "\b", 1);
sanitize_append_path_element(path, "filename", 8);
#ifdef TORRENT_WINDOWS
TEST_EQUAL(path, "_\\filename");
#else
TEST_EQUAL(path, "_/filename");
#endif
path.clear();
sanitize_append_path_element(path, "filename", 8);
sanitize_append_path_element(path, "\b", 1);
#ifdef TORRENT_WINDOWS
TEST_EQUAL(path, "filename\\_");
#else
TEST_EQUAL(path, "filename/_");
#endif
path.clear();
sanitize_append_path_element(path, "abc", 3);
sanitize_append_path_element(path, "", 0);
#ifdef TORRENT_WINDOWS
TEST_EQUAL(path, "abc\\_");
#else
TEST_EQUAL(path, "abc/_");
#endif
path.clear();
sanitize_append_path_element(path, "abc", 3);
sanitize_append_path_element(path, " ", 3);
TEST_EQUAL(path, "abc");
path.clear();
sanitize_append_path_element(path, "", 0);
sanitize_append_path_element(path, "abc", 3);
#ifdef TORRENT_WINDOWS
TEST_EQUAL(path, "_\\abc");
#else
TEST_EQUAL(path, "_/abc");
#endif
path.clear();
sanitize_append_path_element(path, "\b?filename=4", 12);
#ifdef TORRENT_WINDOWS
TEST_EQUAL(path, "__filename=4");
#else
TEST_EQUAL(path, "_?filename=4");
#endif
path.clear();
sanitize_append_path_element(path, "filename=4", 10);
TEST_EQUAL(path, "filename=4");
// valid 2-byte sequence
path.clear();
sanitize_append_path_element(path, "filename\xc2\xa1", 10);
TEST_EQUAL(path, "filename\xc2\xa1");
// truncated 2-byte sequence
path.clear();
sanitize_append_path_element(path, "filename\xc2", 9);
TEST_EQUAL(path, "filename_");
// valid 3-byte sequence
path.clear();
sanitize_append_path_element(path, "filename\xe2\x9f\xb9", 11);
TEST_EQUAL(path, "filename\xe2\x9f\xb9");
// truncated 3-byte sequence (last continuation byte missing)
path.clear();
sanitize_append_path_element(path, "filename\xe2\x9f", 10);
TEST_EQUAL(path, "filename_");
// truncated 3-byte sequence (lead byte only)
path.clear();
sanitize_append_path_element(path, "filename\xe2", 9);
TEST_EQUAL(path, "filename_");
// valid 4-byte sequence
path.clear();
sanitize_append_path_element(path, "filename\xf0\x9f\x92\x88", 12);
TEST_EQUAL(path, "filename\xf0\x9f\x92\x88");
// truncated 4-byte sequence
path.clear();
sanitize_append_path_element(path, "filename\xf0\x9f\x92", 11);
TEST_EQUAL(path, "filename_");
// 5-byte utf-8 sequence (not allowed)
path.clear();
sanitize_append_path_element(path, "filename\xf8\x9f\x9f\x9f\x9f" "foobar", 19);
TEST_EQUAL(path, "filename_____foobar");
// redundant (overlong) 2-byte sequence
// ascii code 0x2e encoded with a leading 0
path.clear();
sanitize_append_path_element(path, "filename\xc0\xae", 10);
TEST_EQUAL(path, "filename_");
// redundant (overlong) 3-byte sequence
// ascii code 0x2e encoded with two leading 0s
path.clear();
sanitize_append_path_element(path, "filename\xe0\x80\xae", 11);
TEST_EQUAL(path, "filename_");
// redundant (overlong) 4-byte sequence
// ascii code 0x2e encoded with three leading 0s
path.clear();
sanitize_append_path_element(path, "filename\xf0\x80\x80\xae", 12);
TEST_EQUAL(path, "filename_");
}
// Unit test for verify_encoding(std::string&).
//
// Each case assigns a byte string to `test`, calls verify_encoding() on it
// and then checks two things:
//   1. the return value: true when the input is expected to be accepted
//      as-is, false when it is expected to be reported as invalid
//   2. the contents of `test` afterwards, since verify_encoding() rewrites
//      its argument in place
// For the multi-byte cases the resulting string is also printed to stderr.
TORRENT_TEST(verify_encoding)
{
// plain ASCII input. verify_encoding() is only expected to repair UTF-8, so
// a control character ('\b') and a '?' are expected to be left untouched and
// the string reported as valid
std::string test = "\b?filename=4";
TEST_CHECK(verify_encoding(test));
TEST_CHECK(test == "\b?filename=4");
// plain ASCII without any special characters
test = "filename=4";
TEST_CHECK(verify_encoding(test));
TEST_CHECK(test == "filename=4");
// valid 2-byte sequence: expected to be preserved byte for byte
test = "filename\xc2\xa1";
TEST_CHECK(verify_encoding(test));
fprintf(stderr, "%s\n", test.c_str());
TEST_CHECK(test == "filename\xc2\xa1");
// truncated 2-byte sequence (lead byte only): expected to be replaced by a
// single '_'
test = "filename\xc2";
TEST_CHECK(!verify_encoding(test));
fprintf(stderr, "%s\n", test.c_str());
TEST_CHECK(test == "filename_");
// valid 3-byte sequence: expected to be preserved byte for byte
test = "filename\xe2\x9f\xb9";
TEST_CHECK(verify_encoding(test));
fprintf(stderr, "%s\n", test.c_str());
TEST_CHECK(test == "filename\xe2\x9f\xb9");
// 3-byte sequence missing its last continuation byte: the two bytes that
// are present are expected to collapse into a single '_'
test = "filename\xe2\x9f";
TEST_CHECK(!verify_encoding(test));
fprintf(stderr, "%s\n", test.c_str());
TEST_CHECK(test == "filename_");
// 3-byte sequence truncated right after the lead byte
test = "filename\xe2";
TEST_CHECK(!verify_encoding(test));
fprintf(stderr, "%s\n", test.c_str());
TEST_CHECK(test == "filename_");
// valid 4-byte sequence: expected to be preserved byte for byte
test = "filename\xf0\x9f\x92\x88";
TEST_CHECK(verify_encoding(test));
fprintf(stderr, "%s\n", test.c_str());
TEST_CHECK(test == "filename\xf0\x9f\x92\x88");
// 4-byte sequence missing its last continuation byte: the three bytes that
// are present are expected to collapse into a single '_'
test = "filename\xf0\x9f\x92";
TEST_CHECK(!verify_encoding(test));
fprintf(stderr, "%s\n", test.c_str());
TEST_CHECK(test == "filename_");
// 5-byte utf-8 sequence (not allowed): each of the five bytes is expected
// to be replaced by its own '_', and the ASCII text that follows is kept.
// The literal is split in two so that the leading 'f' of "foobar" is not
// parsed as part of the preceding \x hex escape
test = "filename\xf8\x9f\x9f\x9f\x9f""foobar";
TEST_CHECK(!verify_encoding(test));
fprintf(stderr, "%s\n", test.c_str());
TEST_CHECK(test == "filename_____foobar");
// redundant (overlong) 2-byte sequence
// ascii code 0x2e encoded with a leading 0
// expected result: one '_' per input byte (two in total)
test = "filename\xc0\xae";
TEST_CHECK(!verify_encoding(test));
fprintf(stderr, "%s\n", test.c_str());
TEST_CHECK(test == "filename__");
// redundant (overlong) 3-byte sequence
// ascii code 0x2e encoded with two leading 0s
// expected result: one '_' per input byte (three in total)
test = "filename\xe0\x80\xae";
TEST_CHECK(!verify_encoding(test));
fprintf(stderr, "%s\n", test.c_str());
TEST_CHECK(test == "filename___");
// redundant (overlong) 4-byte sequence
// ascii code 0x2e encoded with three leading 0s
// expected result: one '_' per input byte (four in total)
test = "filename\xf0\x80\x80\xae";
TEST_CHECK(!verify_encoding(test));
fprintf(stderr, "%s\n", test.c_str());
TEST_CHECK(test == "filename____");
}
TORRENT_TEST(parse)
@ -375,88 +574,6 @@ TORRENT_TEST(parse)
std::cerr << ti3.name() << std::endl;
TEST_EQUAL(ti3.name(), "test2..test3.......test4");
// verify_encoding
std::string test = "\b?filename=4";
TEST_CHECK(!verify_encoding(test, true));
#ifdef TORRENT_WINDOWS
TEST_CHECK(test == "__filename=4");
#else
TEST_CHECK(test == "_?filename=4");
#endif
test = "filename=4";
TEST_CHECK(verify_encoding(test, true));
TEST_CHECK(test == "filename=4");
// valid 2-byte sequence
test = "filename\xc2\xa1";
TEST_CHECK(verify_encoding(test, true));
fprintf(stderr, "%s\n", test.c_str());
TEST_CHECK(test == "filename\xc2\xa1");
// truncated 2-byte sequence
test = "filename\xc2";
TEST_CHECK(!verify_encoding(test, true));
fprintf(stderr, "%s\n", test.c_str());
TEST_CHECK(test == "filename_");
// valid 3-byte sequence
test = "filename\xe2\x9f\xb9";
TEST_CHECK(verify_encoding(test, true));
fprintf(stderr, "%s\n", test.c_str());
TEST_CHECK(test == "filename\xe2\x9f\xb9");
// truncated 3-byte sequence
test = "filename\xe2\x9f";
TEST_CHECK(!verify_encoding(test, true));
fprintf(stderr, "%s\n", test.c_str());
TEST_CHECK(test == "filename_");
// truncated 3-byte sequence
test = "filename\xe2";
TEST_CHECK(!verify_encoding(test, true));
fprintf(stderr, "%s\n", test.c_str());
TEST_CHECK(test == "filename_");
// valid 4-byte sequence
test = "filename\xf0\x9f\x92\x88";
TEST_CHECK(verify_encoding(test, true));
fprintf(stderr, "%s\n", test.c_str());
TEST_CHECK(test == "filename\xf0\x9f\x92\x88");
// truncated 4-byte sequence
test = "filename\xf0\x9f\x92";
TEST_CHECK(!verify_encoding(test, true));
fprintf(stderr, "%s\n", test.c_str());
TEST_CHECK(test == "filename_");
// 5-byte utf-8 sequence (not allowed)
test = "filename\xf8\x9f\x9f\x9f\x9f""foobar";
TEST_CHECK(!verify_encoding(test, true));
fprintf(stderr, "%s\n", test.c_str());
TEST_CHECK(test == "filename_____foobar");
// redundant (overlong) 2-byte sequence
// ascii code 0x2e encoded with a leading 0
test = "filename\xc0\xae";
TEST_CHECK(!verify_encoding(test, true));
fprintf(stderr, "%s\n", test.c_str());
TEST_CHECK(test == "filename__");
// redundant (overlong) 3-byte sequence
// ascii code 0x2e encoded with two leading 0s
test = "filename\xe0\x80\xae";
TEST_CHECK(!verify_encoding(test, true));
fprintf(stderr, "%s\n", test.c_str());
TEST_CHECK(test == "filename___");
// redundant (overlong) 4-byte sequence
// ascii code 0x2e encoded with three leading 0s
test = "filename\xf0\x80\x80\xae";
TEST_CHECK(!verify_encoding(test, true));
fprintf(stderr, "%s\n", test.c_str());
TEST_CHECK(test == "filename____");
std::string root_dir = parent_path(current_working_directory());
for (int i = 0; i < int(sizeof(test_torrents)/sizeof(test_torrents[0])); ++i)
{