From 41513ef243198d39a9926407faa82693d1917d2d Mon Sep 17 00:00:00 2001 From: Thomas Mullaly Date: Fri, 16 Jul 2010 11:16:56 -0400 Subject: [PATCH] urlmon: Implemented function for finding the domain name of a URI. --- dlls/urlmon/tests/uri.c | 291 ++++++++++++++++++++++++++++++++++++++++ dlls/urlmon/uri.c | 153 +++++++++++++++++++++ 2 files changed, 444 insertions(+) diff --git a/dlls/urlmon/tests/uri.c b/dlls/urlmon/tests/uri.c index a419be0cd0c..49fb941a6ea 100644 --- a/dlls/urlmon/tests/uri.c +++ b/dlls/urlmon/tests/uri.c @@ -1926,6 +1926,297 @@ static const uri_properties uri_tests[] = { {URL_SCHEME_HTTP,S_OK,FALSE}, {URLZONE_INVALID,E_NOTIMPL,FALSE} } + }, + { "http://google.com.uk", 0, S_OK, FALSE, + Uri_HAS_ABSOLUTE_URI|Uri_HAS_AUTHORITY|Uri_HAS_DISPLAY_URI|Uri_HAS_DOMAIN| + Uri_HAS_HOST|Uri_HAS_PATH|Uri_HAS_PATH_AND_QUERY|Uri_HAS_RAW_URI|Uri_HAS_SCHEME_NAME| + Uri_HAS_HOST_TYPE|Uri_HAS_PORT|Uri_HAS_SCHEME, + TRUE, + { + {"http://google.com.uk/",S_OK,TRUE}, + {"google.com.uk",S_OK,FALSE}, + {"http://google.com.uk/",S_OK,TRUE}, + {"google.com.uk",S_OK,TRUE}, + {"",S_FALSE,TRUE}, + {"",S_FALSE,TRUE}, + {"google.com.uk",S_OK,FALSE}, + {"",S_FALSE,FALSE}, + {"/",S_OK,TRUE}, + {"/",S_OK,TRUE}, + {"",S_FALSE,TRUE}, + {"http://google.com.uk",S_OK,FALSE}, + {"http",S_OK,FALSE}, + {"",S_FALSE,FALSE}, + {"",S_FALSE,FALSE} + }, + { + {Uri_HOST_DNS,S_OK,FALSE}, + {80,S_OK,FALSE}, + {URL_SCHEME_HTTP,S_OK,FALSE}, + {URLZONE_INVALID,E_NOTIMPL,FALSE} + } + }, + { "http://google.com.com", 0, S_OK, FALSE, + Uri_HAS_ABSOLUTE_URI|Uri_HAS_AUTHORITY|Uri_HAS_DISPLAY_URI|Uri_HAS_DOMAIN| + Uri_HAS_HOST|Uri_HAS_PATH|Uri_HAS_PATH_AND_QUERY|Uri_HAS_RAW_URI|Uri_HAS_SCHEME_NAME| + Uri_HAS_HOST_TYPE|Uri_HAS_PORT|Uri_HAS_SCHEME, + TRUE, + { + {"http://google.com.com/",S_OK,TRUE}, + {"google.com.com",S_OK,FALSE}, + {"http://google.com.com/",S_OK,TRUE}, + {"com.com",S_OK,TRUE}, + {"",S_FALSE,TRUE}, + {"",S_FALSE,TRUE}, + {"google.com.com",S_OK,FALSE}, + {"",S_FALSE,FALSE}, + {"/",S_OK,TRUE}, 
+ {"/",S_OK,TRUE}, + {"",S_FALSE,TRUE}, + {"http://google.com.com",S_OK,FALSE}, + {"http",S_OK,FALSE}, + {"",S_FALSE,FALSE}, + {"",S_FALSE,FALSE} + }, + { + {Uri_HOST_DNS,S_OK,FALSE}, + {80,S_OK,FALSE}, + {URL_SCHEME_HTTP,S_OK,FALSE}, + {URLZONE_INVALID,E_NOTIMPL,FALSE} + } + }, + { "http://google.uk.1", 0, S_OK, FALSE, + Uri_HAS_ABSOLUTE_URI|Uri_HAS_AUTHORITY|Uri_HAS_DISPLAY_URI|Uri_HAS_DOMAIN| + Uri_HAS_HOST|Uri_HAS_PATH|Uri_HAS_PATH_AND_QUERY|Uri_HAS_RAW_URI|Uri_HAS_SCHEME_NAME| + Uri_HAS_HOST_TYPE|Uri_HAS_PORT|Uri_HAS_SCHEME, + TRUE, + { + {"http://google.uk.1/",S_OK,TRUE}, + {"google.uk.1",S_OK,FALSE}, + {"http://google.uk.1/",S_OK,TRUE}, + {"google.uk.1",S_OK,TRUE}, + {"",S_FALSE,TRUE}, + {"",S_FALSE,TRUE}, + {"google.uk.1",S_OK,FALSE}, + {"",S_FALSE,FALSE}, + {"/",S_OK,TRUE}, + {"/",S_OK,TRUE}, + {"",S_FALSE,TRUE}, + {"http://google.uk.1",S_OK,FALSE}, + {"http",S_OK,FALSE}, + {"",S_FALSE,FALSE}, + {"",S_FALSE,FALSE} + }, + { + {Uri_HOST_DNS,S_OK,FALSE}, + {80,S_OK,FALSE}, + {URL_SCHEME_HTTP,S_OK,FALSE}, + {URLZONE_INVALID,E_NOTIMPL,FALSE} + } + }, + /* Since foo isn't a recognized 3 character TLD its considered the domain name. 
*/ + { "http://google.foo.uk", 0, S_OK, FALSE, + Uri_HAS_ABSOLUTE_URI|Uri_HAS_AUTHORITY|Uri_HAS_DISPLAY_URI|Uri_HAS_DOMAIN| + Uri_HAS_HOST|Uri_HAS_PATH|Uri_HAS_PATH_AND_QUERY|Uri_HAS_RAW_URI|Uri_HAS_SCHEME_NAME| + Uri_HAS_HOST_TYPE|Uri_HAS_PORT|Uri_HAS_SCHEME, + TRUE, + { + {"http://google.foo.uk/",S_OK,TRUE}, + {"google.foo.uk",S_OK,FALSE}, + {"http://google.foo.uk/",S_OK,TRUE}, + {"foo.uk",S_OK,TRUE}, + {"",S_FALSE,TRUE}, + {"",S_FALSE,TRUE}, + {"google.foo.uk",S_OK,FALSE}, + {"",S_FALSE,FALSE}, + {"/",S_OK,TRUE}, + {"/",S_OK,TRUE}, + {"",S_FALSE,TRUE}, + {"http://google.foo.uk",S_OK,FALSE}, + {"http",S_OK,FALSE}, + {"",S_FALSE,FALSE}, + {"",S_FALSE,FALSE} + }, + { + {Uri_HOST_DNS,S_OK,FALSE}, + {80,S_OK,FALSE}, + {URL_SCHEME_HTTP,S_OK,FALSE}, + {URLZONE_INVALID,E_NOTIMPL,FALSE} + } + }, + { "http://.com", 0, S_OK, FALSE, + Uri_HAS_ABSOLUTE_URI|Uri_HAS_AUTHORITY|Uri_HAS_DISPLAY_URI|Uri_HAS_DOMAIN| + Uri_HAS_HOST|Uri_HAS_PATH|Uri_HAS_PATH_AND_QUERY|Uri_HAS_RAW_URI|Uri_HAS_SCHEME_NAME| + Uri_HAS_HOST_TYPE|Uri_HAS_PORT|Uri_HAS_SCHEME, + TRUE, + { + {"http://.com/",S_OK,TRUE}, + {".com",S_OK,FALSE}, + {"http://.com/",S_OK,TRUE}, + {".com",S_OK,TRUE}, + {"",S_FALSE,TRUE}, + {"",S_FALSE,TRUE}, + {".com",S_OK,FALSE}, + {"",S_FALSE,FALSE}, + {"/",S_OK,TRUE}, + {"/",S_OK,TRUE}, + {"",S_FALSE,TRUE}, + {"http://.com",S_OK,FALSE}, + {"http",S_OK,FALSE}, + {"",S_FALSE,FALSE}, + {"",S_FALSE,FALSE} + }, + { + {Uri_HOST_DNS,S_OK,FALSE}, + {80,S_OK,FALSE}, + {URL_SCHEME_HTTP,S_OK,FALSE}, + {URLZONE_INVALID,E_NOTIMPL,FALSE} + } + }, + { "http://.uk", 0, S_OK, FALSE, + Uri_HAS_ABSOLUTE_URI|Uri_HAS_AUTHORITY|Uri_HAS_DISPLAY_URI|Uri_HAS_HOST| + Uri_HAS_PATH|Uri_HAS_PATH_AND_QUERY|Uri_HAS_RAW_URI|Uri_HAS_SCHEME_NAME| + Uri_HAS_HOST_TYPE|Uri_HAS_PORT|Uri_HAS_SCHEME, + TRUE, + { + {"http://.uk/",S_OK,TRUE}, + {".uk",S_OK,FALSE}, + {"http://.uk/",S_OK,TRUE}, + {"",S_FALSE,TRUE}, + {"",S_FALSE,TRUE}, + {"",S_FALSE,TRUE}, + {".uk",S_OK,FALSE}, + {"",S_FALSE,FALSE}, + {"/",S_OK,TRUE}, + 
{"/",S_OK,TRUE}, + {"",S_FALSE,TRUE}, + {"http://.uk",S_OK,FALSE}, + {"http",S_OK,FALSE}, + {"",S_FALSE,FALSE}, + {"",S_FALSE,FALSE} + }, + { + {Uri_HOST_DNS,S_OK,FALSE}, + {80,S_OK,FALSE}, + {URL_SCHEME_HTTP,S_OK,FALSE}, + {URLZONE_INVALID,E_NOTIMPL,FALSE} + } + }, + { "http://www.co.google.com.[]", 0, S_OK, FALSE, + Uri_HAS_ABSOLUTE_URI|Uri_HAS_AUTHORITY|Uri_HAS_DISPLAY_URI|Uri_HAS_DOMAIN| + Uri_HAS_HOST|Uri_HAS_PATH|Uri_HAS_PATH_AND_QUERY|Uri_HAS_RAW_URI|Uri_HAS_SCHEME_NAME| + Uri_HAS_HOST_TYPE|Uri_HAS_PORT|Uri_HAS_SCHEME, + TRUE, + { + {"http://www.co.google.com.[]/",S_OK,TRUE}, + {"www.co.google.com.[]",S_OK,FALSE}, + {"http://www.co.google.com.[]/",S_OK,TRUE}, + {"google.com.[]",S_OK,TRUE}, + {"",S_FALSE,TRUE}, + {"",S_FALSE,TRUE}, + {"www.co.google.com.[]",S_OK,FALSE}, + {"",S_FALSE,FALSE}, + {"/",S_OK,TRUE}, + {"/",S_OK,TRUE}, + {"",S_FALSE,TRUE}, + {"http://www.co.google.com.[]",S_OK,FALSE}, + {"http",S_OK,FALSE}, + {"",S_FALSE,FALSE}, + {"",S_FALSE,FALSE} + }, + { + {Uri_HOST_DNS,S_OK,FALSE}, + {80,S_OK,FALSE}, + {URL_SCHEME_HTTP,S_OK,FALSE}, + {URLZONE_INVALID,E_NOTIMPL,FALSE} + } + }, + { "http://co.uk", 0, S_OK, FALSE, + Uri_HAS_ABSOLUTE_URI|Uri_HAS_AUTHORITY|Uri_HAS_DISPLAY_URI|Uri_HAS_HOST| + Uri_HAS_PATH|Uri_HAS_PATH_AND_QUERY|Uri_HAS_RAW_URI|Uri_HAS_SCHEME_NAME| + Uri_HAS_HOST_TYPE|Uri_HAS_PORT|Uri_HAS_SCHEME, + TRUE, + { + {"http://co.uk/",S_OK,TRUE}, + {"co.uk",S_OK,FALSE}, + {"http://co.uk/",S_OK,TRUE}, + {"",S_FALSE,TRUE}, + {"",S_FALSE,TRUE}, + {"",S_FALSE,TRUE}, + {"co.uk",S_OK,FALSE}, + {"",S_FALSE,FALSE}, + {"/",S_OK,TRUE}, + {"/",S_OK,TRUE}, + {"",S_FALSE,TRUE}, + {"http://co.uk",S_OK,FALSE}, + {"http",S_OK,FALSE}, + {"",S_FALSE,FALSE}, + {"",S_FALSE,FALSE} + }, + { + {Uri_HOST_DNS,S_OK,FALSE}, + {80,S_OK,FALSE}, + {URL_SCHEME_HTTP,S_OK,FALSE}, + {URLZONE_INVALID,E_NOTIMPL,FALSE} + } + }, + { "http://www.co.google.us.test", 0, S_OK, FALSE, + Uri_HAS_ABSOLUTE_URI|Uri_HAS_AUTHORITY|Uri_HAS_DISPLAY_URI|Uri_HAS_DOMAIN| + 
Uri_HAS_HOST|Uri_HAS_PATH|Uri_HAS_PATH_AND_QUERY|Uri_HAS_RAW_URI|Uri_HAS_SCHEME_NAME| + Uri_HAS_HOST_TYPE|Uri_HAS_PORT|Uri_HAS_SCHEME, + TRUE, + { + {"http://www.co.google.us.test/",S_OK,TRUE}, + {"www.co.google.us.test",S_OK,FALSE}, + {"http://www.co.google.us.test/",S_OK,TRUE}, + {"us.test",S_OK,TRUE}, + {"",S_FALSE,TRUE}, + {"",S_FALSE,TRUE}, + {"www.co.google.us.test",S_OK,FALSE}, + {"",S_FALSE,FALSE}, + {"/",S_OK,TRUE}, + {"/",S_OK,TRUE}, + {"",S_FALSE,TRUE}, + {"http://www.co.google.us.test",S_OK,FALSE}, + {"http",S_OK,FALSE}, + {"",S_FALSE,FALSE}, + {"",S_FALSE,FALSE} + }, + { + {Uri_HOST_DNS,S_OK,FALSE}, + {80,S_OK,FALSE}, + {URL_SCHEME_HTTP,S_OK,FALSE}, + {URLZONE_INVALID,E_NOTIMPL,FALSE} + } + }, + { "http://gov.uk", 0, S_OK, FALSE, + Uri_HAS_ABSOLUTE_URI|Uri_HAS_AUTHORITY|Uri_HAS_DISPLAY_URI|Uri_HAS_HOST| + Uri_HAS_PATH|Uri_HAS_PATH_AND_QUERY|Uri_HAS_RAW_URI|Uri_HAS_SCHEME_NAME| + Uri_HAS_HOST_TYPE|Uri_HAS_PORT|Uri_HAS_SCHEME, + TRUE, + { + {"http://gov.uk/",S_OK,TRUE}, + {"gov.uk",S_OK,FALSE}, + {"http://gov.uk/",S_OK,TRUE}, + {"",S_FALSE,TRUE}, + {"",S_FALSE,TRUE}, + {"",S_FALSE,TRUE}, + {"gov.uk",S_OK,FALSE}, + {"",S_FALSE,FALSE}, + {"/",S_OK,TRUE}, + {"/",S_OK,TRUE}, + {"",S_FALSE,TRUE}, + {"http://gov.uk",S_OK,FALSE}, + {"http",S_OK,FALSE}, + {"",S_FALSE,FALSE}, + {"",S_FALSE,FALSE} + }, + { + {Uri_HOST_DNS,S_OK,FALSE}, + {80,S_OK,FALSE}, + {URL_SCHEME_HTTP,S_OK,FALSE}, + {URLZONE_INVALID,E_NOTIMPL,FALSE} + } } }; diff --git a/dlls/urlmon/uri.c b/dlls/urlmon/uri.c index be6f89ccc88..fc9ebfe899d 100644 --- a/dlls/urlmon/uri.c +++ b/dlls/urlmon/uri.c @@ -56,6 +56,8 @@ typedef struct { INT authority_start; DWORD authority_len; + + INT domain_offset; } Uri; typedef struct { @@ -158,6 +160,21 @@ static const struct { {URL_SCHEME_HTTPS, 443}, }; +/* List of 3 character top level domain names Windows seems to recognize. + * There might be more, but, these are the only ones I've found so far. 
+ */ +static const struct { + WCHAR tld_name[4]; +} recognized_tlds[] = { + {{'c','o','m',0}}, + {{'e','d','u',0}}, + {{'g','o','v',0}}, + {{'i','n','t',0}}, + {{'m','i','l',0}}, + {{'n','e','t',0}}, + {{'o','r','g',0}} +}; + static inline BOOL is_alpha(WCHAR val) { return ((val >= 'a' && val <= 'z') || (val >= 'A' && val <= 'Z')); } @@ -312,6 +329,138 @@ static inline void pct_encode_val(WCHAR val, WCHAR *dest) { dest[2] = hexDigits[val & 0xf]; } +/* Scans the range of characters [str, end] and returns the last occurrence + * of 'ch' or returns NULL (also when the range is empty, i.e. end < str). + */ +static const WCHAR *str_last_of(const WCHAR *str, const WCHAR *end, WCHAR ch) { + const WCHAR *ptr = end; + + while(ptr >= str) { + if(*ptr == ch) + return ptr; + --ptr; + } + + return NULL; +} + +/* Attempts to parse the domain name from the host. + * + * This function also includes the Top-level Domain (TLD) name + * of the host when it tries to find the domain name. If it finds + * a valid domain name it will assign 'domain_start' the offset + * into 'host' where the domain name starts. + * + * It's implied that if a domain name is found that it goes + * from [host+domain_start, host+host_len). + */ +static void find_domain_name(const WCHAR *host, DWORD host_len, + INT *domain_start) { + const WCHAR *last_tld, *sec_last_tld, *end; + + end = host+host_len-1; + + *domain_start = -1; + + /* There has to be at least enough room for a '.' followed by a + * 3 character TLD for a domain to even exist in the host name. + */ + if(host_len < 4) + return; + + last_tld = str_last_of(host, end, '.'); + if(!last_tld) + /* http://hostname -> has no domain name. */ + return; + + sec_last_tld = str_last_of(host, last_tld-1, '.'); + if(!sec_last_tld) { + /* If the '.' is at the beginning of the host there + * has to be at least 3 characters in the TLD for it + * to be valid. + * Ex: .com -> .com as the domain name. + * .co -> has no domain name. 
+ */ + if(last_tld-host == 0) { + if(end-(last_tld-1) < 3) + return; + } else if(last_tld-host == 3) { + DWORD i; + + /* If there's three characters in front of last_tld and + * they are on the list of recognized TLDs, then this + * host doesn't have a domain (since the host only contains + * a TLD name). + * Ex: edu.uk -> has no domain name. + * foo.uk -> foo.uk as the domain name. + */ + for(i = 0; i < sizeof(recognized_tlds)/sizeof(recognized_tlds[0]); ++i) { + if(!StrCmpNIW(host, recognized_tlds[i].tld_name, 3)) + return; + } + } else if(last_tld-host < 3) + /* Anything less than 3 characters is considered part + * of the TLD name. + * Ex: ak.uk -> Has no domain name. + */ + return; + + /* Otherwise the domain name is the whole host name. */ + *domain_start = 0; + } else if(end+1-last_tld > 3) { + /* If the last_tld has 3 or more characters then it's automatically + * considered the TLD of the domain name. + * Ex: www.winehq.org.uk.test -> uk.test as the domain name. + */ + *domain_start = (sec_last_tld+1)-host; + } else if(last_tld - (sec_last_tld+1) < 4) { + DWORD i; + /* If the sec_last_tld is 3 characters long it HAS to be on the list of + * recognized TLDs to still be considered part of the TLD name, otherwise + * it's considered the domain name. + * Ex: www.google.com.uk -> google.com.uk as the domain name. + * www.google.foo.uk -> foo.uk as the domain name. + */ + if(last_tld - (sec_last_tld+1) == 3) { + for(i = 0; i < sizeof(recognized_tlds)/sizeof(recognized_tlds[0]); ++i) { + if(!StrCmpNIW(sec_last_tld+1, recognized_tlds[i].tld_name, 3)) { + const WCHAR *domain = str_last_of(host, sec_last_tld-1, '.'); + + if(!domain) + *domain_start = 0; + else + *domain_start = (domain+1) - host; + TRACE("Found domain name %s\n", debugstr_wn(host+*domain_start, + (host+host_len)-(host+*domain_start))); + return; + } + } + + *domain_start = (sec_last_tld+1)-host; + } else { + /* Since the sec_last_tld is less than 3 characters it's considered + * part of the TLD. 
+ * Ex: www.google.fo.uk -> google.fo.uk as the domain name. + */ + const WCHAR *domain = str_last_of(host, sec_last_tld-1, '.'); + + if(!domain) + *domain_start = 0; + else + *domain_start = (domain+1) - host; + } + } else { + /* The second to last TLD has more than 3 characters making it + * the domain name. + * Ex: www.google.test.us -> test.us as the domain name. + */ + *domain_start = (sec_last_tld+1)-host; + } + + TRACE("Found domain name %s\n", debugstr_wn(host+*domain_start, + (host+host_len)-(host+*domain_start))); +} + /* Computes the location where the elision should occur in the IPv6 * address using the numerical values of each component stored in * 'values'. If the address shouldn't contain an elision then 'index' @@ -1593,6 +1742,10 @@ static BOOL canonicalize_reg_name(const parse_data *data, Uri *uri, computeOnly, debugstr_wn(uri->canon_uri+uri->host_start, uri->host_len), uri->host_len); + if(!computeOnly) + find_domain_name(uri->canon_uri+uri->host_start, uri->host_len, + &(uri->domain_offset)); + return TRUE; }