diff --git a/ChangeLog b/ChangeLog index 0b0322552..3606c4607 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,4 @@ + * improve path sanitization (filter unicode text direction characters) * deprecate partial_piece_info::piece_state * bind upnp requests to correct local address * save resume data when removing web seeds diff --git a/include/libtorrent/ConvertUTF.h b/include/libtorrent/ConvertUTF.h index f8b14ddd2..f534c3969 100644 --- a/include/libtorrent/ConvertUTF.h +++ b/include/libtorrent/ConvertUTF.h @@ -159,6 +159,11 @@ TORRENT_EXTRA_EXPORT ConversionResult ConvertUTF32toUTF16 ( TORRENT_EXTRA_EXPORT Boolean isLegalUTF8Sequence(const UTF8 *source, const UTF8 *sourceEnd); +TORRENT_EXTRA_EXPORT Boolean isLegalUTF8(const UTF8 *source, int length); + +extern const char trailingBytesForUTF8[256]; +extern const UTF32 offsetsFromUTF8[6]; + #ifdef __cplusplus } #endif diff --git a/include/libtorrent/utf8.hpp b/include/libtorrent/utf8.hpp index 50856f334..7e5f1793d 100644 --- a/include/libtorrent/utf8.hpp +++ b/include/libtorrent/utf8.hpp @@ -39,6 +39,7 @@ POSSIBILITY OF SUCH DAMAGE. // convert_to_native and convert_from_native #if TORRENT_USE_WSTRING || defined TORRENT_WINDOWS +#include #include #include @@ -71,6 +72,9 @@ namespace libtorrent const std::string &utf8, std::wstring &wide); TORRENT_EXTRA_EXPORT utf8_conv_result_t wchar_utf8( const std::wstring &wide, std::string &utf8); + + TORRENT_EXTRA_EXPORT std::pair + parse_utf8_codepoint(char const* str, int len); } #endif // !BOOST_NO_STD_WSTRING diff --git a/src/ConvertUTF.cpp b/src/ConvertUTF.cpp index 2a4bb6992..60ba17df1 100644 --- a/src/ConvertUTF.cpp +++ b/src/ConvertUTF.cpp @@ -171,7 +171,7 @@ if (result == sourceIllegal) { * left as-is for anyone who may want to do such conversion, which was * allowed in earlier algorithms. */ -static const char trailingBytesForUTF8[256] = { +const char trailingBytesForUTF8[256] = { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, @@ -187,7 +187,7 @@ static const char trailingBytesForUTF8[256] = { * This table contains as many values as there might be trailing bytes * in a UTF-8 sequence. */ -static const UTF32 offsetsFromUTF8[6] = { 0x00000000UL, 0x00003080UL, 0x000E2080UL, +const UTF32 offsetsFromUTF8[6] = { 0x00000000UL, 0x00003080UL, 0x000E2080UL, 0x03C82080UL, 0xFA082080UL, 0x82082080UL }; /* @@ -292,7 +292,7 @@ ConversionResult ConvertUTF16toUTF8 ( * definition of UTF-8 goes up to 4-byte sequences. */ -static Boolean isLegalUTF8(const UTF8 *source, int length) { +Boolean isLegalUTF8(const UTF8 *source, int length) { UTF8 a; const UTF8 *srcptr = source+length; switch (length) { diff --git a/src/torrent_info.cpp b/src/torrent_info.cpp index 482bd96b3..d5d0a0b24 100644 --- a/src/torrent_info.cpp +++ b/src/torrent_info.cpp @@ -59,6 +59,8 @@ POSSIBILITY OF SUCH DAMAGE. #include #include #include +#include +#include #include #include @@ -81,15 +83,33 @@ namespace libtorrent namespace { - bool valid_path_character(char c) + bool valid_path_character(boost::int32_t const c) { #ifdef TORRENT_WINDOWS static const char invalid_chars[] = "?<>\"|\b*:"; #else static const char invalid_chars[] = ""; #endif - if (c >= 0 && c < 32) return false; - return std::strchr(invalid_chars, c) == 0; + if (c < 32) return false; + if (c > 127) return true; + return std::strchr(invalid_chars, static_cast(c)) == NULL; + } + + bool filter_path_character(boost::int32_t const c) + { + // these unicode characters change the writing writing direction of the + // string and can be used for attacks: + // https://security.stackexchange.com/questions/158802/how-can-this-executable-have-an-avi-extension + static const boost::array bad_cp = {{0x202a, 0x202b, 0x202c, 0x202d, 0x202e, 0x200e, 0x200f}}; + if (std::find(bad_cp.begin(), bad_cp.end(), c) != bad_cp.end()) return true; + +#ifdef TORRENT_WINDOWS + static const char invalid_chars[] = "/\\:"; +#else + static const char invalid_chars[] = "/\\"; +#endif + if (c > 127) return false; + return std::strchr(invalid_chars, static_cast(c)) != NULL; } } // anonymous namespace @@ -223,118 +243,35 @@ namespace libtorrent // the number of dots we've added char num_dots = 0; bool found_extension = false; - for (int i = 0; i < element_len; ++i) + + int seq_len = 0; + for (int i = 0; i < element_len; i += seq_len) { - if (element[i] == '/' - || element[i] == '\\' -#ifdef TORRENT_WINDOWS - || element[i] == ':' -#endif - ) + boost::int32_t code_point; + boost::tie(code_point, seq_len) = parse_utf8_codepoint(element + i, element_len - i); + + if (code_point >= 0 && filter_path_character(code_point)) + { continue; + } - if (element[i] == '.') ++num_dots; - - int last_len = 0; - - if ((element[i] & 0x80) == 0) - { - // 1 byte - if (valid_path_character(element[i])) - { - path += element[i]; - } - else - { - path += '_'; - } - last_len = 1; - } - else if ((element[i] & 0xe0) == 0xc0) - { - // 2 bytes - if (element_len - i < 2 - || (element[i+1] & 0xc0) != 0x80) - { - path += '_'; - last_len = 1; - } - else if ((element[i] & 0x1f) == 0) - { - // overlong sequences are invalid - path += '_'; - last_len = 1; - } - else - { - path += element[i]; - path += element[i+1]; - last_len = 2; - } - i += 1; - } - else if ((element[i] & 0xf0) == 0xe0) - { - // 3 bytes - if (element_len - i < 3 - || (element[i+1] & 0xc0) != 0x80 - || (element[i+2] & 0xc0) != 0x80 - ) - { - path += '_'; - last_len = 1; - } - else if ((element[i] & 0x0f) == 0) - { - // overlong sequences are invalid - path += '_'; - last_len = 1; - } - else - { - path += element[i]; - path += element[i+1]; - path += element[i+2]; - last_len = 3; - } - i += 2; - } - else if ((element[i] & 0xf8) == 0xf0) - { - // 4 bytes - if (element_len - i < 4 - || (element[i+1] & 0xc0) != 0x80 - || (element[i+2] & 0xc0) != 0x80 - || (element[i+3] & 0xc0) != 0x80 - ) - { - path += '_'; - last_len = 1; - } - else if ((element[i] & 0x07) == 0 - && (element[i+1] & 0x3f) == 0) - { - // overlong sequences are invalid - path += '_'; - last_len = 1; - } - else - { - path += element[i]; - path += element[i+1]; - path += element[i+2]; - path += element[i+3]; - last_len = 4; - } - i += 3; - } - else + if (code_point < 0 + || !valid_path_character(code_point)) { + // invalid utf8 sequence, replace with "_" path += '_'; - last_len = 1; + ++added; + ++unicode_chars; + continue; } - added += last_len; + // validation passed, add it to the output string + for (int k = i; k < i + seq_len; ++k) + path.push_back(element[k]); + + if (code_point == '.') ++num_dots; + + added += seq_len; ++unicode_chars; // any given path element should not diff --git a/src/utf8.cpp b/src/utf8.cpp index 13d717a2b..703677340 100644 --- a/src/utf8.cpp +++ b/src/utf8.cpp @@ -197,6 +197,39 @@ namespace libtorrent return convert_from_wide::convert( &src_start, src_start + wide.size(), utf8); } + + // returns the unicode codepoint and the number of bytes of the utf8 sequence + // that was parsed. The codepoint is -1 if it's invalid + std::pair parse_utf8_codepoint(char const* str, int const len) + { + int const sequence_len = trailingBytesForUTF8[static_cast(*str)] + 1; + if (sequence_len > len) return std::make_pair(-1, len); + + if (sequence_len > 4) + { + return std::make_pair(-1, sequence_len); + } + + if (!isLegalUTF8(reinterpret_cast(str), sequence_len)) + { + return std::make_pair(-1, sequence_len); + } + + boost::uint32_t ch = 0; + for (int i = 0; i < sequence_len; ++i) + { + ch <<= 6; + ch += static_cast(str[i]); + } + ch -= offsetsFromUTF8[sequence_len-1]; + + if (ch > 0x7fffffff) + { + return std::make_pair(-1, sequence_len); + } + + return std::make_pair(static_cast(ch), sequence_len); + } } #ifdef __clang__ diff --git a/test/test_torrent_info.cpp b/test/test_torrent_info.cpp index e2f80782c..5cb081cf3 100644 --- a/test/test_torrent_info.cpp +++ b/test/test_torrent_info.cpp @@ -468,7 +468,7 @@ TORRENT_TEST(sanitize_path) // 5-byte utf-8 sequence (not allowed) path.clear(); sanitize_append_path_element(path, "filename\xf8\x9f\x9f\x9f\x9f" "foobar", 19); - TEST_EQUAL(path, "filename_____foobar"); + TEST_EQUAL(path, "filename_foobar"); // redundant (overlong) 2-byte sequence // ascii code 0x2e encoded with a leading 0 @@ -487,6 +487,23 @@ TORRENT_TEST(sanitize_path) path.clear(); sanitize_append_path_element(path, "filename\xf0\x80\x80\xae", 12); TEST_EQUAL(path, "filename_"); + + // a filename where every character is filtered is not replaced by an understcore + path.clear(); + sanitize_append_path_element(path, "//\\", 3); + TEST_EQUAL(path, ""); + + // make sure suspicious unicode characters are filtered out + path.clear(); + // that's utf-8 for U+200e LEFT-TO-RIGHT MARK + sanitize_append_path_element(path, "foo\xe2\x80\x8e" "bar", 9); + TEST_EQUAL(path, "foobar"); + + // make sure suspicious unicode characters are filtered out + path.clear(); + // that's utf-8 for U+202b RIGHT-TO-LEFT EMBEDDING + sanitize_append_path_element(path, "foo\xe2\x80\xab" "bar", 9); + TEST_EQUAL(path, "foobar"); } TORRENT_TEST(verify_encoding)