From 300d0e1ad4aff3895c3c8db06c1c4bdd72892641 Mon Sep 17 00:00:00 2001 From: Thomas Mullaly Date: Mon, 19 Jul 2010 21:05:30 -0400 Subject: [PATCH] urlmon: Implemented canonicalization function for hierarchical URI paths. --- dlls/urlmon/tests/uri.c | 181 ++++++++++++++++++++++++++++++++++++++++ dlls/urlmon/uri.c | 116 ++++++++++++++++++++++++- 2 files changed, 296 insertions(+), 1 deletion(-) diff --git a/dlls/urlmon/tests/uri.c b/dlls/urlmon/tests/uri.c index acf6905f21d..3b049403c41 100644 --- a/dlls/urlmon/tests/uri.c +++ b/dlls/urlmon/tests/uri.c @@ -2273,6 +2273,187 @@ static const uri_properties uri_tests[] = { {URL_SCHEME_UNKNOWN,S_OK,FALSE}, {URLZONE_INVALID,E_NOTIMPL,FALSE} } + }, + /* Since the original URI doesn't contain an extra '/' before the path no % encoded values + * are decoded and all '%' are encoded. + */ + { "file://C:/te%3Es%2Et/tes%t.mp3", 0, S_OK, FALSE, + Uri_HAS_ABSOLUTE_URI|Uri_HAS_DISPLAY_URI|Uri_HAS_EXTENSION|Uri_HAS_PATH| + Uri_HAS_PATH_AND_QUERY|Uri_HAS_RAW_URI|Uri_HAS_SCHEME_NAME|Uri_HAS_HOST_TYPE|Uri_HAS_SCHEME, + TRUE, + { + {"file:///C:/te%253Es%252Et/tes%25t.mp3",S_OK,TRUE}, + {"",S_FALSE,FALSE}, + {"file:///C:/te%253Es%252Et/tes%25t.mp3",S_OK,TRUE}, + {"",S_FALSE,FALSE}, + {".mp3",S_OK,TRUE}, + {"",S_FALSE,TRUE}, + {"",S_FALSE,FALSE}, + {"",S_FALSE,FALSE}, + {"/C:/te%253Es%252Et/tes%25t.mp3",S_OK,TRUE}, + {"/C:/te%253Es%252Et/tes%25t.mp3",S_OK,TRUE}, + {"",S_FALSE,TRUE}, + {"file://C:/te%3Es%2Et/tes%t.mp3",S_OK,FALSE}, + {"file",S_OK,FALSE}, + {"",S_FALSE,FALSE}, + {"",S_FALSE,FALSE} + }, + { + {Uri_HOST_UNKNOWN,S_OK,FALSE}, + {0,S_FALSE,FALSE}, + {URL_SCHEME_FILE,S_OK,FALSE}, + {URLZONE_INVALID,E_NOTIMPL,FALSE} + } + }, + /* Since there's a '/' in front of the drive letter, any percent encoded, non-forbidden character + * is decoded and only %'s in front of invalid hex digits are encoded. 
+ */ + { "file:///C:/te%3Es%2Et/t%23es%t.mp3", 0, S_OK, FALSE, + Uri_HAS_ABSOLUTE_URI|Uri_HAS_DISPLAY_URI|Uri_HAS_EXTENSION|Uri_HAS_PATH| + Uri_HAS_PATH_AND_QUERY|Uri_HAS_RAW_URI|Uri_HAS_SCHEME_NAME|Uri_HAS_HOST_TYPE|Uri_HAS_SCHEME, + TRUE, + { + {"file:///C:/te%3Es.t/t#es%25t.mp3",S_OK,TRUE}, + {"",S_FALSE,FALSE}, + {"file:///C:/te%3Es.t/t#es%25t.mp3",S_OK,TRUE}, + {"",S_FALSE,FALSE}, + {".mp3",S_OK,TRUE}, + {"",S_FALSE,TRUE}, + {"",S_FALSE,FALSE}, + {"",S_FALSE,FALSE}, + {"/C:/te%3Es.t/t#es%25t.mp3",S_OK,TRUE}, + {"/C:/te%3Es.t/t#es%25t.mp3",S_OK,TRUE}, + {"",S_FALSE,TRUE}, + {"file:///C:/te%3Es%2Et/t%23es%t.mp3",S_OK,FALSE}, + {"file",S_OK,FALSE}, + {"",S_FALSE,FALSE}, + {"",S_FALSE,FALSE} + }, + { + {Uri_HOST_UNKNOWN,S_OK,FALSE}, + {0,S_FALSE,FALSE}, + {URL_SCHEME_FILE,S_OK,FALSE}, + {URLZONE_INVALID,E_NOTIMPL,FALSE} + } + }, + /* Only unreserved percent encoded characters are decoded for known schemes that aren't file. */ + { "http://[::001.002.003.000]/%3F%23%2E%54/test", 0, S_OK, FALSE, + Uri_HAS_ABSOLUTE_URI|Uri_HAS_AUTHORITY|Uri_HAS_DISPLAY_URI|Uri_HAS_HOST| + Uri_HAS_PATH|Uri_HAS_PATH_AND_QUERY|Uri_HAS_RAW_URI|Uri_HAS_SCHEME_NAME| + Uri_HAS_HOST_TYPE|Uri_HAS_PORT|Uri_HAS_SCHEME, + TRUE, + { + {"http://[::1.2.3.0]/%3F%23.T/test",S_OK,TRUE}, + {"[::1.2.3.0]",S_OK,FALSE}, + {"http://[::1.2.3.0]/%3F%23.T/test",S_OK,TRUE}, + {"",S_FALSE,FALSE}, + {"",S_FALSE,TRUE}, + {"",S_FALSE,TRUE}, + {"::1.2.3.0",S_OK,FALSE}, + {"",S_FALSE,FALSE}, + {"/%3F%23.T/test",S_OK,TRUE}, + {"/%3F%23.T/test",S_OK,TRUE}, + {"",S_FALSE,TRUE}, + {"http://[::001.002.003.000]/%3F%23%2E%54/test",S_OK,FALSE}, + {"http",S_OK,FALSE}, + {"",S_FALSE,FALSE}, + {"",S_FALSE,FALSE}, + }, + { + {Uri_HOST_IPV6,S_OK,FALSE}, + {80,S_OK,FALSE}, + {URL_SCHEME_HTTP,S_OK,FALSE}, + {URLZONE_INVALID,E_NOTIMPL,FALSE} + } + }, + /* Forbidden characters are always encoded for file URIs. 
*/ + { "file:///C:/\"test\"/test.mp3", Uri_CREATE_NO_ENCODE_FORBIDDEN_CHARACTERS, S_OK, FALSE, + Uri_HAS_ABSOLUTE_URI|Uri_HAS_DISPLAY_URI|Uri_HAS_EXTENSION|Uri_HAS_PATH| + Uri_HAS_PATH_AND_QUERY|Uri_HAS_RAW_URI|Uri_HAS_SCHEME_NAME|Uri_HAS_HOST_TYPE|Uri_HAS_SCHEME, + TRUE, + { + {"file:///C:/%22test%22/test.mp3",S_OK,TRUE}, + {"",S_FALSE,FALSE}, + {"file:///C:/%22test%22/test.mp3",S_OK,TRUE}, + {"",S_FALSE,FALSE}, + {".mp3",S_OK,TRUE}, + {"",S_FALSE,TRUE}, + {"",S_FALSE,FALSE}, + {"",S_FALSE,FALSE}, + {"/C:/%22test%22/test.mp3",S_OK,TRUE}, + {"/C:/%22test%22/test.mp3",S_OK,TRUE}, + {"",S_FALSE,TRUE}, + {"file:///C:/\"test\"/test.mp3",S_OK,FALSE}, + {"file",S_OK,FALSE}, + {"",S_FALSE,FALSE}, + {"",S_FALSE,FALSE} + }, + { + {Uri_HOST_UNKNOWN,S_OK,FALSE}, + {0,S_FALSE,FALSE}, + {URL_SCHEME_FILE,S_OK,FALSE}, + {URLZONE_INVALID,E_NOTIMPL,FALSE} + } + }, + /* Forbidden characters are never encoded for unknown scheme types. */ + { "1234://4294967295/<|>\" test<|>", 0, S_OK, FALSE, + Uri_HAS_ABSOLUTE_URI|Uri_HAS_AUTHORITY|Uri_HAS_DISPLAY_URI|Uri_HAS_HOST| + Uri_HAS_PATH|Uri_HAS_PATH_AND_QUERY|Uri_HAS_RAW_URI|Uri_HAS_SCHEME_NAME| + Uri_HAS_HOST_TYPE|Uri_HAS_SCHEME, + TRUE, + { + {"1234://4294967295/<|>\" test<|>",S_OK,TRUE}, + {"4294967295",S_OK,FALSE}, + {"1234://4294967295/<|>\" test<|>",S_OK,TRUE}, + {"",S_FALSE,FALSE}, + {"",S_FALSE,TRUE}, + {"",S_FALSE,TRUE}, + {"4294967295",S_OK,FALSE}, + {"",S_FALSE,FALSE}, + {"/<|>\" test<|>",S_OK,TRUE}, + {"/<|>\" test<|>",S_OK,TRUE}, + {"",S_FALSE,TRUE}, + {"1234://4294967295/<|>\" test<|>",S_OK,FALSE}, + {"1234",S_OK,FALSE}, + {"",S_FALSE,FALSE}, + {"",S_FALSE,FALSE} + }, + { + {Uri_HOST_IPV4,S_OK,FALSE}, + {0,S_FALSE,FALSE}, + {URL_SCHEME_UNKNOWN,S_OK,FALSE}, + {URLZONE_INVALID,E_NOTIMPL,FALSE} + } + }, + /* Make sure forbidden characters are percent encoded. 
*/ + { "http://gov.uk/<|> test<|>", 0, S_OK, FALSE, + Uri_HAS_ABSOLUTE_URI|Uri_HAS_AUTHORITY|Uri_HAS_DISPLAY_URI|Uri_HAS_HOST| + Uri_HAS_PATH|Uri_HAS_PATH_AND_QUERY|Uri_HAS_RAW_URI|Uri_HAS_SCHEME_NAME| + Uri_HAS_HOST_TYPE|Uri_HAS_PORT|Uri_HAS_SCHEME, + TRUE, + { + {"http://gov.uk/%3C%7C%3E%20test%3C%7C%3E",S_OK,TRUE}, + {"gov.uk",S_OK,FALSE}, + {"http://gov.uk/%3C%7C%3E%20test%3C%7C%3E",S_OK,TRUE}, + {"",S_FALSE,FALSE}, + {"",S_FALSE,TRUE}, + {"",S_FALSE,TRUE}, + {"gov.uk",S_OK,FALSE}, + {"",S_FALSE,FALSE}, + {"/%3C%7C%3E%20test%3C%7C%3E",S_OK,TRUE}, + {"/%3C%7C%3E%20test%3C%7C%3E",S_OK,TRUE}, + {"",S_FALSE,TRUE}, + {"http://gov.uk/<|> test<|>",S_OK,FALSE}, + {"http",S_OK,FALSE}, + {"",S_FALSE,FALSE}, + {"",S_FALSE,FALSE} + }, + { + {Uri_HOST_DNS,S_OK,FALSE}, + {80,S_OK,FALSE}, + {URL_SCHEME_HTTP,S_OK,FALSE}, + {URLZONE_INVALID,E_NOTIMPL,FALSE} + } } }; diff --git a/dlls/urlmon/uri.c b/dlls/urlmon/uri.c index 1b98b1a359c..cc5a54874df 100644 --- a/dlls/urlmon/uri.c +++ b/dlls/urlmon/uri.c @@ -58,6 +58,9 @@ typedef struct { DWORD authority_len; INT domain_offset; + + INT path_start; + DWORD path_len; } Uri; typedef struct { @@ -2266,6 +2269,115 @@ static BOOL canonicalize_authority(const parse_data *data, Uri *uri, DWORD flags return TRUE; } +/* Attempts to canonicalize the path of a hierarchical URI. Records the result in uri->path_start and uri->path_len (path_start is -1 when there is no path) and always returns TRUE. When computeOnly is set only the lengths are computed and nothing is written to uri->canon_uri. + * + * Things that happen: + * 1). Forbidden characters are percent encoded, unless the NO_ENCODE_FORBIDDEN + * flag is set or it's a file URI. Forbidden characters are always encoded + * for file schemes regardless and forbidden characters are never encoded + * for unknown scheme types. + * + * 2). For known scheme types '\\' are changed to '/'. + * + * 3). Percent encoded, unreserved characters are decoded to their actual values. + * Unless the scheme type is unknown. For file schemes any percent encoded + * character in the unreserved or reserved set is decoded. + * + * 4). 
For file schemes, if the path starts with a drive letter and doesn't + * start with a '/' then one is inserted in front of it. + * Ex: file://c:/test.mp3 -> file:///c:/test.mp3 + * + * 5). Dot segments are removed from the path for all scheme types + * unless NO_CANONICALIZE flag is set. Dot segments aren't removed + * for wildcard scheme types. (NOTE: dot segment removal is not performed by this function as written.) + * + * NOTES: + * file://c:/test%20test -> file:///c:/test%2520test + * file://c:/test%3Etest -> file:///c:/test%253Etest + * file:///c:/test%20test -> file:///c:/test%20test + * file:///c:/test%test -> file:///c:/test%25test + */ +static BOOL canonicalize_path_hierarchical(const parse_data *data, Uri *uri, + DWORD flags, BOOL computeOnly) { + const WCHAR *ptr; + const BOOL known_scheme = data->scheme_type != URL_SCHEME_UNKNOWN; + const BOOL is_file = data->scheme_type == URL_SCHEME_FILE; + + BOOL escape_pct = FALSE; + + if(!data->path) { + uri->path_start = -1; + uri->path_len = 0; + return TRUE; + } + + uri->path_start = uri->canon_len; + + /* Check if a '/' needs to be inserted in front of the drive letter for the file scheme. When one is inserted, escape_pct is set so every '%' in the path is escaped instead of decoded. */ + if(is_file) { + if(data->path_len > 1 && is_alpha(*(data->path)) && + *(data->path+1) == ':') { + if(!computeOnly) + uri->canon_uri[uri->canon_len] = '/'; + uri->canon_len++; + escape_pct = TRUE; + } + } + + for(ptr = data->path; ptr < data->path+data->path_len; ++ptr) { + if(*ptr == '%') { + const WCHAR *tmp = ptr; + WCHAR val; + + /* Check if the % represents a valid encoded char, or if it needs to be encoded. */ + BOOL force_encode = !check_pct_encoded(&tmp) && is_file; + val = decode_pct_val(ptr); + + if(force_encode || escape_pct) { + /* Escape the percent sign in the file URI. 
*/ + if(!computeOnly) + pct_encode_val(*ptr, uri->canon_uri+uri->canon_len); + uri->canon_len += 3; + } else if((is_unreserved(val) && known_scheme) || + (is_file && (is_unreserved(val) || is_reserved(val)))) { + if(!computeOnly) + uri->canon_uri[uri->canon_len] = val; + ++uri->canon_len; + + ptr += 2; + continue; + } else { + if(!computeOnly) + uri->canon_uri[uri->canon_len] = *ptr; + ++uri->canon_len; + } + } else if(*ptr == '\\' && known_scheme) { + if(!computeOnly) + uri->canon_uri[uri->canon_len] = '/'; + ++uri->canon_len; + } else if(known_scheme && !is_unreserved(*ptr) && !is_reserved(*ptr) && + (!(flags & Uri_CREATE_NO_ENCODE_FORBIDDEN_CHARACTERS) || is_file)) { + /* Escape the forbidden character. */ + if(!computeOnly) + pct_encode_val(*ptr, uri->canon_uri+uri->canon_len); + uri->canon_len += 3; + } else { + if(!computeOnly) + uri->canon_uri[uri->canon_len] = *ptr; + ++uri->canon_len; + } + } + + uri->path_len = uri->canon_len - uri->path_start; + + if(!computeOnly) + TRACE("Canonicalized path %s len=%d\n", + debugstr_wn(uri->canon_uri+uri->path_start, uri->path_len), + uri->path_len); + + return TRUE; +} + /* Determines how the URI represented by the parse_data should be canonicalized. * * Essentially, if the parse_data represents an hierarchical URI then it calls @@ -2288,7 +2400,9 @@ static BOOL canonicalize_hierpart(const parse_data *data, Uri *uri, DWORD flags, if(!canonicalize_authority(data, uri, flags, computeOnly)) return FALSE; - /* TODO: Canonicalize the path of the URI. */ + /* Canonicalize the path of the URI. */ + if(!canonicalize_path_hierarchical(data, uri, flags, computeOnly)) + return FALSE; } else { /* Opaque URI's don't have an authority. */