improve path sanitization (filter unicode text direction characters)

This commit is contained in:
arvidn 2017-05-14 20:58:51 -04:00 committed by Arvid Norberg
parent 9333c51da6
commit 56d5d795bf
7 changed files with 109 additions and 112 deletions

View File

@ -1,3 +1,4 @@
* improve path sanitization (filter unicode text direction characters)
* deprecate partial_piece_info::piece_state
* bind upnp requests to correct local address
* save resume data when removing web seeds

View File

@ -159,6 +159,11 @@ TORRENT_EXTRA_EXPORT ConversionResult ConvertUTF32toUTF16 (
TORRENT_EXTRA_EXPORT Boolean isLegalUTF8Sequence(const UTF8 *source,
const UTF8 *sourceEnd);
TORRENT_EXTRA_EXPORT Boolean isLegalUTF8(const UTF8 *source, int length);
extern const char trailingBytesForUTF8[256];
extern const UTF32 offsetsFromUTF8[6];
#ifdef __cplusplus
}
#endif

View File

@ -39,6 +39,7 @@ POSSIBILITY OF SUCH DAMAGE.
// convert_to_native and convert_from_native
#if TORRENT_USE_WSTRING || defined TORRENT_WINDOWS
#include <boost/cstdint.hpp>
#include <string>
#include <cwchar>
@ -71,6 +72,9 @@ namespace libtorrent
const std::string &utf8, std::wstring &wide);
TORRENT_EXTRA_EXPORT utf8_conv_result_t wchar_utf8(
const std::wstring &wide, std::string &utf8);
TORRENT_EXTRA_EXPORT std::pair<boost::int32_t, int>
parse_utf8_codepoint(char const* str, int len);
}
#endif // !BOOST_NO_STD_WSTRING

View File

@ -171,7 +171,7 @@ if (result == sourceIllegal) {
* left as-is for anyone who may want to do such conversion, which was
* allowed in earlier algorithms.
*/
static const char trailingBytesForUTF8[256] = {
const char trailingBytesForUTF8[256] = {
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
@ -187,7 +187,7 @@ static const char trailingBytesForUTF8[256] = {
* This table contains as many values as there might be trailing bytes
* in a UTF-8 sequence.
*/
static const UTF32 offsetsFromUTF8[6] = { 0x00000000UL, 0x00003080UL, 0x000E2080UL,
const UTF32 offsetsFromUTF8[6] = { 0x00000000UL, 0x00003080UL, 0x000E2080UL,
0x03C82080UL, 0xFA082080UL, 0x82082080UL };
/*
@ -292,7 +292,7 @@ ConversionResult ConvertUTF16toUTF8 (
* definition of UTF-8 goes up to 4-byte sequences.
*/
static Boolean isLegalUTF8(const UTF8 *source, int length) {
Boolean isLegalUTF8(const UTF8 *source, int length) {
UTF8 a;
const UTF8 *srcptr = source+length;
switch (length) {

View File

@ -59,6 +59,8 @@ POSSIBILITY OF SUCH DAMAGE.
#include <boost/bind.hpp>
#include <boost/assert.hpp>
#include <boost/unordered_set.hpp>
#include <boost/array.hpp>
#include <boost/cstdint.hpp>
#include <iterator>
#include <algorithm>
@ -81,15 +83,33 @@ namespace libtorrent
namespace {
bool valid_path_character(char c)
bool valid_path_character(boost::int32_t const c)
{
#ifdef TORRENT_WINDOWS
static const char invalid_chars[] = "?<>\"|\b*:";
#else
static const char invalid_chars[] = "";
#endif
if (c >= 0 && c < 32) return false;
return std::strchr(invalid_chars, c) == 0;
if (c < 32) return false;
if (c > 127) return true;
return std::strchr(invalid_chars, static_cast<char>(c)) == NULL;
}
bool filter_path_character(boost::int32_t const c)
{
// these unicode characters change the writing writing direction of the
// string and can be used for attacks:
// https://security.stackexchange.com/questions/158802/how-can-this-executable-have-an-avi-extension
static const boost::array<boost::int32_t, 7> bad_cp = {{0x202a, 0x202b, 0x202c, 0x202d, 0x202e, 0x200e, 0x200f}};
if (std::find(bad_cp.begin(), bad_cp.end(), c) != bad_cp.end()) return true;
#ifdef TORRENT_WINDOWS
static const char invalid_chars[] = "/\\:";
#else
static const char invalid_chars[] = "/\\";
#endif
if (c > 127) return false;
return std::strchr(invalid_chars, static_cast<char>(c)) != NULL;
}
} // anonymous namespace
@ -223,118 +243,35 @@ namespace libtorrent
// the number of dots we've added
char num_dots = 0;
bool found_extension = false;
for (int i = 0; i < element_len; ++i)
int seq_len = 0;
for (int i = 0; i < element_len; i += seq_len)
{
if (element[i] == '/'
|| element[i] == '\\'
#ifdef TORRENT_WINDOWS
|| element[i] == ':'
#endif
)
boost::int32_t code_point;
boost::tie(code_point, seq_len) = parse_utf8_codepoint(element + i, element_len - i);
if (code_point >= 0 && filter_path_character(code_point))
{
continue;
}
if (element[i] == '.') ++num_dots;
int last_len = 0;
if ((element[i] & 0x80) == 0)
{
// 1 byte
if (valid_path_character(element[i]))
{
path += element[i];
}
else
{
path += '_';
}
last_len = 1;
}
else if ((element[i] & 0xe0) == 0xc0)
{
// 2 bytes
if (element_len - i < 2
|| (element[i+1] & 0xc0) != 0x80)
{
path += '_';
last_len = 1;
}
else if ((element[i] & 0x1f) == 0)
{
// overlong sequences are invalid
path += '_';
last_len = 1;
}
else
{
path += element[i];
path += element[i+1];
last_len = 2;
}
i += 1;
}
else if ((element[i] & 0xf0) == 0xe0)
{
// 3 bytes
if (element_len - i < 3
|| (element[i+1] & 0xc0) != 0x80
|| (element[i+2] & 0xc0) != 0x80
)
{
path += '_';
last_len = 1;
}
else if ((element[i] & 0x0f) == 0)
{
// overlong sequences are invalid
path += '_';
last_len = 1;
}
else
{
path += element[i];
path += element[i+1];
path += element[i+2];
last_len = 3;
}
i += 2;
}
else if ((element[i] & 0xf8) == 0xf0)
{
// 4 bytes
if (element_len - i < 4
|| (element[i+1] & 0xc0) != 0x80
|| (element[i+2] & 0xc0) != 0x80
|| (element[i+3] & 0xc0) != 0x80
)
{
path += '_';
last_len = 1;
}
else if ((element[i] & 0x07) == 0
&& (element[i+1] & 0x3f) == 0)
{
// overlong sequences are invalid
path += '_';
last_len = 1;
}
else
{
path += element[i];
path += element[i+1];
path += element[i+2];
path += element[i+3];
last_len = 4;
}
i += 3;
}
else
if (code_point < 0
|| !valid_path_character(code_point))
{
// invalid utf8 sequence, replace with "_"
path += '_';
last_len = 1;
++added;
++unicode_chars;
continue;
}
added += last_len;
// validation passed, add it to the output string
for (int k = i; k < i + seq_len; ++k)
path.push_back(element[k]);
if (code_point == '.') ++num_dots;
added += seq_len;
++unicode_chars;
// any given path element should not

View File

@ -197,6 +197,39 @@ namespace libtorrent
return convert_from_wide<sizeof(wchar_t)>::convert(
&src_start, src_start + wide.size(), utf8);
}
// returns the unicode codepoint and the number of bytes of the utf8 sequence
// that was parsed. The codepoint is -1 if it's invalid
std::pair<boost::int32_t, int> parse_utf8_codepoint(char const* str, int const len)
{
int const sequence_len = trailingBytesForUTF8[static_cast<boost::uint8_t>(*str)] + 1;
if (sequence_len > len) return std::make_pair(-1, len);
if (sequence_len > 4)
{
return std::make_pair(-1, sequence_len);
}
if (!isLegalUTF8(reinterpret_cast<UTF8 const*>(str), sequence_len))
{
return std::make_pair(-1, sequence_len);
}
boost::uint32_t ch = 0;
for (int i = 0; i < sequence_len; ++i)
{
ch <<= 6;
ch += static_cast<boost::uint8_t>(str[i]);
}
ch -= offsetsFromUTF8[sequence_len-1];
if (ch > 0x7fffffff)
{
return std::make_pair(-1, sequence_len);
}
return std::make_pair(static_cast<boost::int32_t>(ch), sequence_len);
}
}
#ifdef __clang__

View File

@ -468,7 +468,7 @@ TORRENT_TEST(sanitize_path)
// 5-byte utf-8 sequence (not allowed)
path.clear();
sanitize_append_path_element(path, "filename\xf8\x9f\x9f\x9f\x9f" "foobar", 19);
TEST_EQUAL(path, "filename_____foobar");
TEST_EQUAL(path, "filename_foobar");
// redundant (overlong) 2-byte sequence
// ascii code 0x2e encoded with a leading 0
@ -487,6 +487,23 @@ TORRENT_TEST(sanitize_path)
path.clear();
sanitize_append_path_element(path, "filename\xf0\x80\x80\xae", 12);
TEST_EQUAL(path, "filename_");
// a filename where every character is filtered is not replaced by an understcore
path.clear();
sanitize_append_path_element(path, "//\\", 3);
TEST_EQUAL(path, "");
// make sure suspicious unicode characters are filtered out
path.clear();
// that's utf-8 for U+200e LEFT-TO-RIGHT MARK
sanitize_append_path_element(path, "foo\xe2\x80\x8e" "bar", 9);
TEST_EQUAL(path, "foobar");
// make sure suspicious unicode characters are filtered out
path.clear();
// that's utf-8 for U+202b RIGHT-TO-LEFT EMBEDDING
sanitize_append_path_element(path, "foo\xe2\x80\xab" "bar", 9);
TEST_EQUAL(path, "foobar");
}
TORRENT_TEST(verify_encoding)