urlmon: Implemented canonicalization function for hierarchical URI paths.

This commit is contained in:
Thomas Mullaly 2010-07-19 21:05:30 -04:00 committed by Alexandre Julliard
parent 93d79ee18e
commit 300d0e1ad4
2 changed files with 296 additions and 1 deletions

View File

@ -2273,6 +2273,187 @@ static const uri_properties uri_tests[] = {
{URL_SCHEME_UNKNOWN,S_OK,FALSE},
{URLZONE_INVALID,E_NOTIMPL,FALSE}
}
},
/* Since the original URI doesn't contain an extra '/' before the path no % encoded values
* are decoded and all '%' are encoded.
*/
{ "file://C:/te%3Es%2Et/tes%t.mp3", 0, S_OK, FALSE,
Uri_HAS_ABSOLUTE_URI|Uri_HAS_DISPLAY_URI|Uri_HAS_EXTENSION|Uri_HAS_PATH|
Uri_HAS_PATH_AND_QUERY|Uri_HAS_RAW_URI|Uri_HAS_SCHEME_NAME|Uri_HAS_HOST_TYPE|Uri_HAS_SCHEME,
TRUE,
{
{"file:///C:/te%253Es%252Et/tes%25t.mp3",S_OK,TRUE},
{"",S_FALSE,FALSE},
{"file:///C:/te%253Es%252Et/tes%25t.mp3",S_OK,TRUE},
{"",S_FALSE,FALSE},
{".mp3",S_OK,TRUE},
{"",S_FALSE,TRUE},
{"",S_FALSE,FALSE},
{"",S_FALSE,FALSE},
{"/C:/te%253Es%252Et/tes%25t.mp3",S_OK,TRUE},
{"/C:/te%253Es%252Et/tes%25t.mp3",S_OK,TRUE},
{"",S_FALSE,TRUE},
{"file://C:/te%3Es%2Et/tes%t.mp3",S_OK,FALSE},
{"file",S_OK,FALSE},
{"",S_FALSE,FALSE},
{"",S_FALSE,FALSE}
},
{
{Uri_HOST_UNKNOWN,S_OK,FALSE},
{0,S_FALSE,FALSE},
{URL_SCHEME_FILE,S_OK,FALSE},
{URLZONE_INVALID,E_NOTIMPL,FALSE}
}
},
/* Since there's a '/' in front of the drive letter, any percent encoded, non-forbidden character
* is decoded and only %'s in front of invalid hex digits are encoded.
*/
{ "file:///C:/te%3Es%2Et/t%23es%t.mp3", 0, S_OK, FALSE,
Uri_HAS_ABSOLUTE_URI|Uri_HAS_DISPLAY_URI|Uri_HAS_EXTENSION|Uri_HAS_PATH|
Uri_HAS_PATH_AND_QUERY|Uri_HAS_RAW_URI|Uri_HAS_SCHEME_NAME|Uri_HAS_HOST_TYPE|Uri_HAS_SCHEME,
TRUE,
{
{"file:///C:/te%3Es.t/t#es%25t.mp3",S_OK,TRUE},
{"",S_FALSE,FALSE},
{"file:///C:/te%3Es.t/t#es%25t.mp3",S_OK,TRUE},
{"",S_FALSE,FALSE},
{".mp3",S_OK,TRUE},
{"",S_FALSE,TRUE},
{"",S_FALSE,FALSE},
{"",S_FALSE,FALSE},
{"/C:/te%3Es.t/t#es%25t.mp3",S_OK,TRUE},
{"/C:/te%3Es.t/t#es%25t.mp3",S_OK,TRUE},
{"",S_FALSE,TRUE},
{"file:///C:/te%3Es%2Et/t%23es%t.mp3",S_OK,FALSE},
{"file",S_OK,FALSE},
{"",S_FALSE,FALSE},
{"",S_FALSE,FALSE}
},
{
{Uri_HOST_UNKNOWN,S_OK,FALSE},
{0,S_FALSE,FALSE},
{URL_SCHEME_FILE,S_OK,FALSE},
{URLZONE_INVALID,E_NOTIMPL,FALSE}
}
},
/* Only unreserved percent encoded characters are decoded for known schemes that aren't file. */
{ "http://[::001.002.003.000]/%3F%23%2E%54/test", 0, S_OK, FALSE,
Uri_HAS_ABSOLUTE_URI|Uri_HAS_AUTHORITY|Uri_HAS_DISPLAY_URI|Uri_HAS_HOST|
Uri_HAS_PATH|Uri_HAS_PATH_AND_QUERY|Uri_HAS_RAW_URI|Uri_HAS_SCHEME_NAME|
Uri_HAS_HOST_TYPE|Uri_HAS_PORT|Uri_HAS_SCHEME,
TRUE,
{
{"http://[::1.2.3.0]/%3F%23.T/test",S_OK,TRUE},
{"[::1.2.3.0]",S_OK,FALSE},
{"http://[::1.2.3.0]/%3F%23.T/test",S_OK,TRUE},
{"",S_FALSE,FALSE},
{"",S_FALSE,TRUE},
{"",S_FALSE,TRUE},
{"::1.2.3.0",S_OK,FALSE},
{"",S_FALSE,FALSE},
{"/%3F%23.T/test",S_OK,TRUE},
{"/%3F%23.T/test",S_OK,TRUE},
{"",S_FALSE,TRUE},
{"http://[::001.002.003.000]/%3F%23%2E%54/test",S_OK,FALSE},
{"http",S_OK,FALSE},
{"",S_FALSE,FALSE},
{"",S_FALSE,FALSE},
},
{
{Uri_HOST_IPV6,S_OK,FALSE},
{80,S_OK,FALSE},
{URL_SCHEME_HTTP,S_OK,FALSE},
{URLZONE_INVALID,E_NOTIMPL,FALSE}
}
},
/* Forbidden characters are always encoded for file URIs. */
{ "file:///C:/\"test\"/test.mp3", Uri_CREATE_NO_ENCODE_FORBIDDEN_CHARACTERS, S_OK, FALSE,
Uri_HAS_ABSOLUTE_URI|Uri_HAS_DISPLAY_URI|Uri_HAS_EXTENSION|Uri_HAS_PATH|
Uri_HAS_PATH_AND_QUERY|Uri_HAS_RAW_URI|Uri_HAS_SCHEME_NAME|Uri_HAS_HOST_TYPE|Uri_HAS_SCHEME,
TRUE,
{
{"file:///C:/%22test%22/test.mp3",S_OK,TRUE},
{"",S_FALSE,FALSE},
{"file:///C:/%22test%22/test.mp3",S_OK,TRUE},
{"",S_FALSE,FALSE},
{".mp3",S_OK,TRUE},
{"",S_FALSE,TRUE},
{"",S_FALSE,FALSE},
{"",S_FALSE,FALSE},
{"/C:/%22test%22/test.mp3",S_OK,TRUE},
{"/C:/%22test%22/test.mp3",S_OK,TRUE},
{"",S_FALSE,TRUE},
{"file:///C:/\"test\"/test.mp3",S_OK,FALSE},
{"file",S_OK,FALSE},
{"",S_FALSE,FALSE},
{"",S_FALSE,FALSE}
},
{
{Uri_HOST_UNKNOWN,S_OK,FALSE},
{0,S_FALSE,FALSE},
{URL_SCHEME_FILE,S_OK,FALSE},
{URLZONE_INVALID,E_NOTIMPL,FALSE}
}
},
/* Forbidden characters are never encoded for unknown scheme types. */
{ "1234://4294967295/<|>\" test<|>", 0, S_OK, FALSE,
Uri_HAS_ABSOLUTE_URI|Uri_HAS_AUTHORITY|Uri_HAS_DISPLAY_URI|Uri_HAS_HOST|
Uri_HAS_PATH|Uri_HAS_PATH_AND_QUERY|Uri_HAS_RAW_URI|Uri_HAS_SCHEME_NAME|
Uri_HAS_HOST_TYPE|Uri_HAS_SCHEME,
TRUE,
{
{"1234://4294967295/<|>\" test<|>",S_OK,TRUE},
{"4294967295",S_OK,FALSE},
{"1234://4294967295/<|>\" test<|>",S_OK,TRUE},
{"",S_FALSE,FALSE},
{"",S_FALSE,TRUE},
{"",S_FALSE,TRUE},
{"4294967295",S_OK,FALSE},
{"",S_FALSE,FALSE},
{"/<|>\" test<|>",S_OK,TRUE},
{"/<|>\" test<|>",S_OK,TRUE},
{"",S_FALSE,TRUE},
{"1234://4294967295/<|>\" test<|>",S_OK,FALSE},
{"1234",S_OK,FALSE},
{"",S_FALSE,FALSE},
{"",S_FALSE,FALSE}
},
{
{Uri_HOST_IPV4,S_OK,FALSE},
{0,S_FALSE,FALSE},
{URL_SCHEME_UNKNOWN,S_OK,FALSE},
{URLZONE_INVALID,E_NOTIMPL,FALSE}
}
},
/* Make sure forbidden characters are percent encoded. */
{ "http://gov.uk/<|> test<|>", 0, S_OK, FALSE,
Uri_HAS_ABSOLUTE_URI|Uri_HAS_AUTHORITY|Uri_HAS_DISPLAY_URI|Uri_HAS_HOST|
Uri_HAS_PATH|Uri_HAS_PATH_AND_QUERY|Uri_HAS_RAW_URI|Uri_HAS_SCHEME_NAME|
Uri_HAS_HOST_TYPE|Uri_HAS_PORT|Uri_HAS_SCHEME,
TRUE,
{
{"http://gov.uk/%3C%7C%3E%20test%3C%7C%3E",S_OK,TRUE},
{"gov.uk",S_OK,FALSE},
{"http://gov.uk/%3C%7C%3E%20test%3C%7C%3E",S_OK,TRUE},
{"",S_FALSE,FALSE},
{"",S_FALSE,TRUE},
{"",S_FALSE,TRUE},
{"gov.uk",S_OK,FALSE},
{"",S_FALSE,FALSE},
{"/%3C%7C%3E%20test%3C%7C%3E",S_OK,TRUE},
{"/%3C%7C%3E%20test%3C%7C%3E",S_OK,TRUE},
{"",S_FALSE,TRUE},
{"http://gov.uk/<|> test<|>",S_OK,FALSE},
{"http",S_OK,FALSE},
{"",S_FALSE,FALSE},
{"",S_FALSE,FALSE}
},
{
{Uri_HOST_DNS,S_OK,FALSE},
{80,S_OK,FALSE},
{URL_SCHEME_HTTP,S_OK,FALSE},
{URLZONE_INVALID,E_NOTIMPL,FALSE}
}
}
};

View File

@ -58,6 +58,9 @@ typedef struct {
DWORD authority_len;
INT domain_offset;
INT path_start;
DWORD path_len;
} Uri;
typedef struct {
@ -2266,6 +2269,115 @@ static BOOL canonicalize_authority(const parse_data *data, Uri *uri, DWORD flags
return TRUE;
}
/* Canonicalizes the path component of a hierarchical URI and appends the
 * result to uri->canon_uri, recording where it landed in uri->path_start
 * and uri->path_len.
 *
 * When computeOnly is set nothing is written to uri->canon_uri; only
 * uri->canon_len (and therefore the path offsets) is advanced, so the
 * caller can measure the space required.
 *
 * If the parsed URI has no path, path_start is set to -1 and path_len to 0.
 *
 * Transformations performed on the path:
 *  1). Forbidden characters (neither reserved nor unreserved) are percent
 *      encoded for known scheme types, unless the
 *      Uri_CREATE_NO_ENCODE_FORBIDDEN_CHARACTERS flag is set. File URIs
 *      ignore that flag and always encode them; unknown scheme types
 *      never encode them.
 *
 *  2). For known scheme types '\\' is replaced with '/'.
 *
 *  3). For known scheme types, percent encoded unreserved characters are
 *      decoded. For file URIs percent encoded reserved characters are
 *      decoded as well. Nothing is decoded for unknown scheme types.
 *
 *  4). For file URIs whose path begins directly with a drive letter
 *      ("c:") a '/' is prepended, and every '%' in the path is then
 *      escaped as "%25" instead of being interpreted.
 *      Ex: file://c:/test.mp3 -> file:///c:/test.mp3
 *
 *  5). For file URIs a '%' that does not start a valid percent encoded
 *      sequence is escaped as "%25".
 *
 * Dot segments are not removed by this function.
 *
 * NOTES:
 *      file://c:/test%20test   -> file:///c:/test%2520test
 *      file://c:/test%3Etest   -> file:///c:/test%253Etest
 *      file:///c:/test%20test  -> file:///c:/test%20test
 *      file:///c:/test%test    -> file:///c:/test%25test
 *
 * Always returns TRUE.
 */
static BOOL canonicalize_path_hierarchical(const parse_data *data, Uri *uri,
                                           DWORD flags, BOOL computeOnly) {
    const BOOL is_file = data->scheme_type == URL_SCHEME_FILE;
    const BOOL known_scheme = data->scheme_type != URL_SCHEME_UNKNOWN;
    /* Whether forbidden characters get percent encoded at all. */
    const BOOL encode_forbidden = known_scheme &&
        (is_file || !(flags & Uri_CREATE_NO_ENCODE_FORBIDDEN_CHARACTERS));
    const WCHAR *cur, *end;
    BOOL escape_pct = FALSE;

    if(!data->path) {
        uri->path_len = 0;
        uri->path_start = -1;
        return TRUE;
    }

    uri->path_start = uri->canon_len;

    /* A file path that starts right at the drive letter gets a leading '/',
     * and from then on every '%' in it is escaped rather than decoded.
     */
    if(is_file && data->path_len > 1 &&
       is_alpha(data->path[0]) && data->path[1] == ':') {
        if(!computeOnly)
            uri->canon_uri[uri->canon_len] = '/';
        uri->canon_len++;
        escape_pct = TRUE;
    }

    end = data->path + data->path_len;
    for(cur = data->path; cur < end; ++cur) {
        WCHAR out = *cur;       /* Character to emit for this position. */
        BOOL encode = FALSE;    /* Emit 'out' as a "%XX" triplet. */
        int skip = 0;           /* Extra input characters consumed. */

        if(*cur == '%') {
            const WCHAR *probe = cur;
            const BOOL valid = check_pct_encoded(&probe);
            const WCHAR decoded = decode_pct_val(cur);

            if(escape_pct || (is_file && !valid)) {
                /* The percent sign itself has to be escaped. */
                encode = TRUE;
            } else {
                const BOOL unreserved = is_unreserved(decoded);

                if((known_scheme && unreserved) ||
                   (is_file && (unreserved || is_reserved(decoded)))) {
                    /* Replace the whole "%XX" sequence with its value. */
                    out = decoded;
                    skip = 2;
                }
                /* Otherwise the '%' is copied through unchanged. */
            }
        } else if(*cur == '\\' && known_scheme) {
            out = '/';
        } else if(encode_forbidden && !is_unreserved(*cur) && !is_reserved(*cur)) {
            /* Forbidden character that must be escaped. */
            encode = TRUE;
        }

        if(encode) {
            if(!computeOnly)
                pct_encode_val(out, uri->canon_uri+uri->canon_len);
            uri->canon_len += 3;
        } else {
            if(!computeOnly)
                uri->canon_uri[uri->canon_len] = out;
            ++uri->canon_len;
        }

        cur += skip;
    }

    uri->path_len = uri->canon_len - uri->path_start;

    if(!computeOnly)
        TRACE("Canonicalized path %s len=%d\n",
              debugstr_wn(uri->canon_uri+uri->path_start, uri->path_len),
              uri->path_len);

    return TRUE;
}
/* Determines how the URI represented by the parse_data should be canonicalized.
*
* Essentially, if the parse_data represents an hierarchical URI then it calls
@ -2288,7 +2400,9 @@ static BOOL canonicalize_hierpart(const parse_data *data, Uri *uri, DWORD flags,
if(!canonicalize_authority(data, uri, flags, computeOnly))
return FALSE;
/* TODO: Canonicalize the path of the URI. */
/* TODO: Canonicalize the path of the URI. */
if(!canonicalize_path_hierarchical(data, uri, flags, computeOnly))
return FALSE;
} else {
/* Opaque URI's don't have an authority. */