urlmon: Implemented function for finding the domain name of a URI.

This commit is contained in:
Thomas Mullaly 2010-07-16 11:16:56 -04:00 committed by Alexandre Julliard
parent e49241f953
commit 41513ef243
2 changed files with 444 additions and 0 deletions

View File

@ -1926,6 +1926,297 @@ static const uri_properties uri_tests[] = {
{URL_SCHEME_HTTP,S_OK,FALSE},
{URLZONE_INVALID,E_NOTIMPL,FALSE}
}
},
{ "http://google.com.uk", 0, S_OK, FALSE,
Uri_HAS_ABSOLUTE_URI|Uri_HAS_AUTHORITY|Uri_HAS_DISPLAY_URI|Uri_HAS_DOMAIN|
Uri_HAS_HOST|Uri_HAS_PATH|Uri_HAS_PATH_AND_QUERY|Uri_HAS_RAW_URI|Uri_HAS_SCHEME_NAME|
Uri_HAS_HOST_TYPE|Uri_HAS_PORT|Uri_HAS_SCHEME,
TRUE,
{
{"http://google.com.uk/",S_OK,TRUE},
{"google.com.uk",S_OK,FALSE},
{"http://google.com.uk/",S_OK,TRUE},
{"google.com.uk",S_OK,TRUE},
{"",S_FALSE,TRUE},
{"",S_FALSE,TRUE},
{"google.com.uk",S_OK,FALSE},
{"",S_FALSE,FALSE},
{"/",S_OK,TRUE},
{"/",S_OK,TRUE},
{"",S_FALSE,TRUE},
{"http://google.com.uk",S_OK,FALSE},
{"http",S_OK,FALSE},
{"",S_FALSE,FALSE},
{"",S_FALSE,FALSE}
},
{
{Uri_HOST_DNS,S_OK,FALSE},
{80,S_OK,FALSE},
{URL_SCHEME_HTTP,S_OK,FALSE},
{URLZONE_INVALID,E_NOTIMPL,FALSE}
}
},
{ "http://google.com.com", 0, S_OK, FALSE,
Uri_HAS_ABSOLUTE_URI|Uri_HAS_AUTHORITY|Uri_HAS_DISPLAY_URI|Uri_HAS_DOMAIN|
Uri_HAS_HOST|Uri_HAS_PATH|Uri_HAS_PATH_AND_QUERY|Uri_HAS_RAW_URI|Uri_HAS_SCHEME_NAME|
Uri_HAS_HOST_TYPE|Uri_HAS_PORT|Uri_HAS_SCHEME,
TRUE,
{
{"http://google.com.com/",S_OK,TRUE},
{"google.com.com",S_OK,FALSE},
{"http://google.com.com/",S_OK,TRUE},
{"com.com",S_OK,TRUE},
{"",S_FALSE,TRUE},
{"",S_FALSE,TRUE},
{"google.com.com",S_OK,FALSE},
{"",S_FALSE,FALSE},
{"/",S_OK,TRUE},
{"/",S_OK,TRUE},
{"",S_FALSE,TRUE},
{"http://google.com.com",S_OK,FALSE},
{"http",S_OK,FALSE},
{"",S_FALSE,FALSE},
{"",S_FALSE,FALSE}
},
{
{Uri_HOST_DNS,S_OK,FALSE},
{80,S_OK,FALSE},
{URL_SCHEME_HTTP,S_OK,FALSE},
{URLZONE_INVALID,E_NOTIMPL,FALSE}
}
},
{ "http://google.uk.1", 0, S_OK, FALSE,
Uri_HAS_ABSOLUTE_URI|Uri_HAS_AUTHORITY|Uri_HAS_DISPLAY_URI|Uri_HAS_DOMAIN|
Uri_HAS_HOST|Uri_HAS_PATH|Uri_HAS_PATH_AND_QUERY|Uri_HAS_RAW_URI|Uri_HAS_SCHEME_NAME|
Uri_HAS_HOST_TYPE|Uri_HAS_PORT|Uri_HAS_SCHEME,
TRUE,
{
{"http://google.uk.1/",S_OK,TRUE},
{"google.uk.1",S_OK,FALSE},
{"http://google.uk.1/",S_OK,TRUE},
{"google.uk.1",S_OK,TRUE},
{"",S_FALSE,TRUE},
{"",S_FALSE,TRUE},
{"google.uk.1",S_OK,FALSE},
{"",S_FALSE,FALSE},
{"/",S_OK,TRUE},
{"/",S_OK,TRUE},
{"",S_FALSE,TRUE},
{"http://google.uk.1",S_OK,FALSE},
{"http",S_OK,FALSE},
{"",S_FALSE,FALSE},
{"",S_FALSE,FALSE}
},
{
{Uri_HOST_DNS,S_OK,FALSE},
{80,S_OK,FALSE},
{URL_SCHEME_HTTP,S_OK,FALSE},
{URLZONE_INVALID,E_NOTIMPL,FALSE}
}
},
/* Since foo isn't a recognized 3 character TLD, it's considered the domain name. */
{ "http://google.foo.uk", 0, S_OK, FALSE,
Uri_HAS_ABSOLUTE_URI|Uri_HAS_AUTHORITY|Uri_HAS_DISPLAY_URI|Uri_HAS_DOMAIN|
Uri_HAS_HOST|Uri_HAS_PATH|Uri_HAS_PATH_AND_QUERY|Uri_HAS_RAW_URI|Uri_HAS_SCHEME_NAME|
Uri_HAS_HOST_TYPE|Uri_HAS_PORT|Uri_HAS_SCHEME,
TRUE,
{
{"http://google.foo.uk/",S_OK,TRUE},
{"google.foo.uk",S_OK,FALSE},
{"http://google.foo.uk/",S_OK,TRUE},
{"foo.uk",S_OK,TRUE},
{"",S_FALSE,TRUE},
{"",S_FALSE,TRUE},
{"google.foo.uk",S_OK,FALSE},
{"",S_FALSE,FALSE},
{"/",S_OK,TRUE},
{"/",S_OK,TRUE},
{"",S_FALSE,TRUE},
{"http://google.foo.uk",S_OK,FALSE},
{"http",S_OK,FALSE},
{"",S_FALSE,FALSE},
{"",S_FALSE,FALSE}
},
{
{Uri_HOST_DNS,S_OK,FALSE},
{80,S_OK,FALSE},
{URL_SCHEME_HTTP,S_OK,FALSE},
{URLZONE_INVALID,E_NOTIMPL,FALSE}
}
},
{ "http://.com", 0, S_OK, FALSE,
Uri_HAS_ABSOLUTE_URI|Uri_HAS_AUTHORITY|Uri_HAS_DISPLAY_URI|Uri_HAS_DOMAIN|
Uri_HAS_HOST|Uri_HAS_PATH|Uri_HAS_PATH_AND_QUERY|Uri_HAS_RAW_URI|Uri_HAS_SCHEME_NAME|
Uri_HAS_HOST_TYPE|Uri_HAS_PORT|Uri_HAS_SCHEME,
TRUE,
{
{"http://.com/",S_OK,TRUE},
{".com",S_OK,FALSE},
{"http://.com/",S_OK,TRUE},
{".com",S_OK,TRUE},
{"",S_FALSE,TRUE},
{"",S_FALSE,TRUE},
{".com",S_OK,FALSE},
{"",S_FALSE,FALSE},
{"/",S_OK,TRUE},
{"/",S_OK,TRUE},
{"",S_FALSE,TRUE},
{"http://.com",S_OK,FALSE},
{"http",S_OK,FALSE},
{"",S_FALSE,FALSE},
{"",S_FALSE,FALSE}
},
{
{Uri_HOST_DNS,S_OK,FALSE},
{80,S_OK,FALSE},
{URL_SCHEME_HTTP,S_OK,FALSE},
{URLZONE_INVALID,E_NOTIMPL,FALSE}
}
},
{ "http://.uk", 0, S_OK, FALSE,
Uri_HAS_ABSOLUTE_URI|Uri_HAS_AUTHORITY|Uri_HAS_DISPLAY_URI|Uri_HAS_HOST|
Uri_HAS_PATH|Uri_HAS_PATH_AND_QUERY|Uri_HAS_RAW_URI|Uri_HAS_SCHEME_NAME|
Uri_HAS_HOST_TYPE|Uri_HAS_PORT|Uri_HAS_SCHEME,
TRUE,
{
{"http://.uk/",S_OK,TRUE},
{".uk",S_OK,FALSE},
{"http://.uk/",S_OK,TRUE},
{"",S_FALSE,TRUE},
{"",S_FALSE,TRUE},
{"",S_FALSE,TRUE},
{".uk",S_OK,FALSE},
{"",S_FALSE,FALSE},
{"/",S_OK,TRUE},
{"/",S_OK,TRUE},
{"",S_FALSE,TRUE},
{"http://.uk",S_OK,FALSE},
{"http",S_OK,FALSE},
{"",S_FALSE,FALSE},
{"",S_FALSE,FALSE}
},
{
{Uri_HOST_DNS,S_OK,FALSE},
{80,S_OK,FALSE},
{URL_SCHEME_HTTP,S_OK,FALSE},
{URLZONE_INVALID,E_NOTIMPL,FALSE}
}
},
{ "http://www.co.google.com.[]", 0, S_OK, FALSE,
Uri_HAS_ABSOLUTE_URI|Uri_HAS_AUTHORITY|Uri_HAS_DISPLAY_URI|Uri_HAS_DOMAIN|
Uri_HAS_HOST|Uri_HAS_PATH|Uri_HAS_PATH_AND_QUERY|Uri_HAS_RAW_URI|Uri_HAS_SCHEME_NAME|
Uri_HAS_HOST_TYPE|Uri_HAS_PORT|Uri_HAS_SCHEME,
TRUE,
{
{"http://www.co.google.com.[]/",S_OK,TRUE},
{"www.co.google.com.[]",S_OK,FALSE},
{"http://www.co.google.com.[]/",S_OK,TRUE},
{"google.com.[]",S_OK,TRUE},
{"",S_FALSE,TRUE},
{"",S_FALSE,TRUE},
{"www.co.google.com.[]",S_OK,FALSE},
{"",S_FALSE,FALSE},
{"/",S_OK,TRUE},
{"/",S_OK,TRUE},
{"",S_FALSE,TRUE},
{"http://www.co.google.com.[]",S_OK,FALSE},
{"http",S_OK,FALSE},
{"",S_FALSE,FALSE},
{"",S_FALSE,FALSE}
},
{
{Uri_HOST_DNS,S_OK,FALSE},
{80,S_OK,FALSE},
{URL_SCHEME_HTTP,S_OK,FALSE},
{URLZONE_INVALID,E_NOTIMPL,FALSE}
}
},
{ "http://co.uk", 0, S_OK, FALSE,
Uri_HAS_ABSOLUTE_URI|Uri_HAS_AUTHORITY|Uri_HAS_DISPLAY_URI|Uri_HAS_HOST|
Uri_HAS_PATH|Uri_HAS_PATH_AND_QUERY|Uri_HAS_RAW_URI|Uri_HAS_SCHEME_NAME|
Uri_HAS_HOST_TYPE|Uri_HAS_PORT|Uri_HAS_SCHEME,
TRUE,
{
{"http://co.uk/",S_OK,TRUE},
{"co.uk",S_OK,FALSE},
{"http://co.uk/",S_OK,TRUE},
{"",S_FALSE,TRUE},
{"",S_FALSE,TRUE},
{"",S_FALSE,TRUE},
{"co.uk",S_OK,FALSE},
{"",S_FALSE,FALSE},
{"/",S_OK,TRUE},
{"/",S_OK,TRUE},
{"",S_FALSE,TRUE},
{"http://co.uk",S_OK,FALSE},
{"http",S_OK,FALSE},
{"",S_FALSE,FALSE},
{"",S_FALSE,FALSE}
},
{
{Uri_HOST_DNS,S_OK,FALSE},
{80,S_OK,FALSE},
{URL_SCHEME_HTTP,S_OK,FALSE},
{URLZONE_INVALID,E_NOTIMPL,FALSE}
}
},
{ "http://www.co.google.us.test", 0, S_OK, FALSE,
Uri_HAS_ABSOLUTE_URI|Uri_HAS_AUTHORITY|Uri_HAS_DISPLAY_URI|Uri_HAS_DOMAIN|
Uri_HAS_HOST|Uri_HAS_PATH|Uri_HAS_PATH_AND_QUERY|Uri_HAS_RAW_URI|Uri_HAS_SCHEME_NAME|
Uri_HAS_HOST_TYPE|Uri_HAS_PORT|Uri_HAS_SCHEME,
TRUE,
{
{"http://www.co.google.us.test/",S_OK,TRUE},
{"www.co.google.us.test",S_OK,FALSE},
{"http://www.co.google.us.test/",S_OK,TRUE},
{"us.test",S_OK,TRUE},
{"",S_FALSE,TRUE},
{"",S_FALSE,TRUE},
{"www.co.google.us.test",S_OK,FALSE},
{"",S_FALSE,FALSE},
{"/",S_OK,TRUE},
{"/",S_OK,TRUE},
{"",S_FALSE,TRUE},
{"http://www.co.google.us.test",S_OK,FALSE},
{"http",S_OK,FALSE},
{"",S_FALSE,FALSE},
{"",S_FALSE,FALSE}
},
{
{Uri_HOST_DNS,S_OK,FALSE},
{80,S_OK,FALSE},
{URL_SCHEME_HTTP,S_OK,FALSE},
{URLZONE_INVALID,E_NOTIMPL,FALSE}
}
},
{ "http://gov.uk", 0, S_OK, FALSE,
Uri_HAS_ABSOLUTE_URI|Uri_HAS_AUTHORITY|Uri_HAS_DISPLAY_URI|Uri_HAS_HOST|
Uri_HAS_PATH|Uri_HAS_PATH_AND_QUERY|Uri_HAS_RAW_URI|Uri_HAS_SCHEME_NAME|
Uri_HAS_HOST_TYPE|Uri_HAS_PORT|Uri_HAS_SCHEME,
TRUE,
{
{"http://gov.uk/",S_OK,TRUE},
{"gov.uk",S_OK,FALSE},
{"http://gov.uk/",S_OK,TRUE},
{"",S_FALSE,TRUE},
{"",S_FALSE,TRUE},
{"",S_FALSE,TRUE},
{"gov.uk",S_OK,FALSE},
{"",S_FALSE,FALSE},
{"/",S_OK,TRUE},
{"/",S_OK,TRUE},
{"",S_FALSE,TRUE},
{"http://gov.uk",S_OK,FALSE},
{"http",S_OK,FALSE},
{"",S_FALSE,FALSE},
{"",S_FALSE,FALSE}
},
{
{Uri_HOST_DNS,S_OK,FALSE},
{80,S_OK,FALSE},
{URL_SCHEME_HTTP,S_OK,FALSE},
{URLZONE_INVALID,E_NOTIMPL,FALSE}
}
}
};

View File

@ -56,6 +56,8 @@ typedef struct {
INT authority_start;
DWORD authority_len;
INT domain_offset;
} Uri;
typedef struct {
@ -158,6 +160,21 @@ static const struct {
{URL_SCHEME_HTTPS, 443},
};
/* List of 3 character top level domain names Windows seems to recognize.
 * There might be more, but these are the only ones found so far.
 *
 * Each entry is a NUL-terminated, lower-case WCHAR string (3 characters
 * plus the terminator). find_domain_name() compares 3-character host
 * labels against these entries with StrCmpNIW to decide whether such a
 * label is part of the TLD or is the domain name itself.
 */
static const struct {
WCHAR tld_name[4];
} recognized_tlds[] = {
{{'c','o','m',0}},
{{'e','d','u',0}},
{{'g','o','v',0}},
{{'i','n','t',0}},
{{'m','i','l',0}},
{{'n','e','t',0}},
{{'o','r','g',0}}
};
/* Tells whether 'val' is an ASCII letter, i.e. falls in a-z or A-Z. */
static inline BOOL is_alpha(WCHAR val) {
    if(val >= 'a' && val <= 'z')
        return TRUE;

    return (val >= 'A' && val <= 'Z');
}
@ -312,6 +329,138 @@ static inline void pct_encode_val(WCHAR val, WCHAR *dest) {
dest[2] = hexDigits[val & 0xf];
}
/* Scans the range of characters [str, end] (both ends inclusive) backwards
 * and returns a pointer to the last occurrence of 'ch', or NULL if 'ch'
 * doesn't appear in the range.
 *
 * An empty range (end < str) always yields NULL.
 */
static const WCHAR *str_last_of(const WCHAR *str, const WCHAR *end, WCHAR ch) {
    const WCHAR *ptr;

    if(end < str)
        return NULL;

    /* Start one past 'end' and step back before each comparison, so the
     * cursor never has to move in front of 'str' to terminate the loop.
     */
    for(ptr = end+1; ptr != str; ) {
        --ptr;
        if(*ptr == ch)
            return ptr;
    }

    return NULL;
}
/* Attempts to parse the domain name from the host.
 *
 * The domain name found by this function includes the Top-level Domain
 * (TLD) name of the host. If a valid domain name is found, 'domain_start'
 * receives the offset into 'host' where the domain name starts; otherwise
 * 'domain_start' is set to -1.
 *
 * It's implied that if a domain name is found it spans
 * [host+domain_start, host+host_len).
 *
 * 'host' doesn't need to be NUL-terminated, only 'host_len' characters
 * are ever examined.
 */
static void find_domain_name(const WCHAR *host, DWORD host_len,
                             INT *domain_start) {
    const WCHAR *last_tld, *sec_last_tld, *end;
    DWORD i;

    *domain_start = -1;

    /* There has to be at least enough room for a '.' followed by a
     * 3 character TLD for a domain to even exist in the host name.
     */
    if(host_len < 4)
        return;

    /* Only computed once the host is known to be non-empty, so 'end'
     * always points at a real character of the host.
     */
    end = host+host_len-1;

    last_tld = str_last_of(host, end, '.');
    if(!last_tld)
        /* http://hostname -> has no domain name. */
        return;

    /* Only look in front of the last '.' when there is something there. */
    sec_last_tld = NULL;
    if(last_tld > host)
        sec_last_tld = str_last_of(host, last_tld-1, '.');

    if(!sec_last_tld) {
        /* The host contains a single '.'. */
        if(last_tld-host == 0) {
            /* If the '.' is at the beginning of the host there
             * has to be at least 3 characters in the TLD for it
             * to be valid.
             *  Ex: .com -> .com as the domain name.
             *      .co  -> has no domain name.
             */
            if(end+1-last_tld < 3)
                return;
        } else if(last_tld-host == 3) {
            /* If there are three characters in front of last_tld and
             * they are on the list of recognized TLDs, then this
             * host doesn't have a domain (since the host only contains
             * a TLD name).
             *  Ex: edu.uk -> has no domain name.
             *      foo.uk -> foo.uk as the domain name.
             */
            for(i = 0; i < sizeof(recognized_tlds)/sizeof(recognized_tlds[0]); ++i) {
                if(!StrCmpNIW(host, recognized_tlds[i].tld_name, 3))
                    return;
            }
        } else if(last_tld-host < 3)
            /* Anything less than 3 characters is considered part
             * of the TLD name.
             *  Ex: ak.uk -> has no domain name.
             */
            return;

        /* Otherwise the domain name is the whole host name. */
        *domain_start = 0;
    } else if(end+1-last_tld > 3) {
        /* If the last label has 3 or more characters then it's
         * automatically considered the TLD of the domain name, and the
         * label in front of it is the domain.
         *  Ex: www.winehq.org.uk.test -> uk.test as the domain name.
         */
        *domain_start = (sec_last_tld+1)-host;
    } else if(last_tld - (sec_last_tld+1) < 4) {
        /* The second to last label has at most 3 characters. When it is
         * shorter than 3 characters it's always considered part of the TLD.
         *  Ex: www.google.fo.uk -> google.fo.uk as the domain name.
         *
         * When it is exactly 3 characters long it HAS to be on the list of
         * recognized TLDs to still be considered part of the TLD name,
         * otherwise it's considered the domain name.
         *  Ex: www.google.com.uk -> google.com.uk as the domain name.
         *      www.google.foo.uk -> foo.uk as the domain name.
         */
        BOOL part_of_tld = TRUE;

        if(last_tld - (sec_last_tld+1) == 3) {
            part_of_tld = FALSE;
            for(i = 0; i < sizeof(recognized_tlds)/sizeof(recognized_tlds[0]); ++i) {
                if(!StrCmpNIW(sec_last_tld+1, recognized_tlds[i].tld_name, 3)) {
                    part_of_tld = TRUE;
                    break;
                }
            }
        }

        if(part_of_tld) {
            /* The domain starts at the label in front of the second to
             * last label, or at the beginning of the host if there is
             * no such label.
             */
            const WCHAR *domain = NULL;

            if(sec_last_tld > host)
                domain = str_last_of(host, sec_last_tld-1, '.');

            if(!domain)
                *domain_start = 0;
            else
                *domain_start = (domain+1) - host;
        } else
            *domain_start = (sec_last_tld+1)-host;
    } else {
        /* The second to last label has more than 3 characters making it
         * the domain name.
         *  Ex: www.google.test.us -> test.us as the domain name.
         */
        *domain_start = (sec_last_tld+1)-host;
    }

    TRACE("Found domain name %s\n", debugstr_wn(host+*domain_start,
        (host+host_len)-(host+*domain_start)));
}
/* Computes the location where the elision should occur in the IPv6
* address using the numerical values of each component stored in
* 'values'. If the address shouldn't contain an elision then 'index'
@ -1593,6 +1742,10 @@ static BOOL canonicalize_reg_name(const parse_data *data, Uri *uri,
computeOnly, debugstr_wn(uri->canon_uri+uri->host_start, uri->host_len),
uri->host_len);
if(!computeOnly)
find_domain_name(uri->canon_uri+uri->host_start, uri->host_len,
&(uri->domain_offset));
return TRUE;
}