diff --git a/dlls/ntdll/locale.c b/dlls/ntdll/locale.c index 508de48a0f9..d1b7ec83809 100644 --- a/dlls/ntdll/locale.c +++ b/dlls/ntdll/locale.c @@ -687,7 +687,8 @@ static NTSTATUS open_nls_data_file( ULONG type, ULONG id, HANDLE *file ) void init_unix_codepage(void) { - unix_funcs->get_unix_codepage( &unix_table ); + USHORT *data = unix_funcs->get_unix_codepage_data(); + if (data) RtlInitCodePageTable( data, &unix_table ); } diff --git a/dlls/ntdll/unix/env.c b/dlls/ntdll/unix/env.c index e57b6256890..e285bbbef0d 100644 --- a/dlls/ntdll/unix/env.c +++ b/dlls/ntdll/unix/env.c @@ -73,10 +73,17 @@ char **main_argv = NULL; char **main_envp = NULL; static WCHAR **main_wargv; -static CPTABLEINFO unix_table; static char system_locale[LOCALE_NAME_MAX_LENGTH]; static char user_locale[LOCALE_NAME_MAX_LENGTH]; +static struct +{ + USHORT *data; + USHORT *dbcs; + USHORT *mbtable; + void *wctable; +} unix_cp; + static void *read_nls_file( const char *name ) { const char *dir = build_dir ? build_dir : data_dir; @@ -108,6 +115,21 @@ static void *read_nls_file( const char *name ) } +static int get_utf16( const WCHAR *src, unsigned int srclen, unsigned int *ch ) +{ + if (IS_HIGH_SURROGATE( src[0] )) + { + if (srclen <= 1) return 0; + if (!IS_LOW_SURROGATE( src[1] )) return 0; + *ch = 0x10000 + ((src[0] & 0x3ff) << 10) + (src[1] & 0x3ff); + return 2; + } + if (IS_LOW_SURROGATE( src[0] )) return 0; + *ch = src[0]; + return 1; +} + + #ifdef __APPLE__ /* The Apple filesystem enforces NFD so we need the compose tables to put it back into NFC */ @@ -148,20 +170,6 @@ static void init_unix_codepage(void) nfc_table = read_nls_file( "normnfc" ); } -static int get_utf16( const WCHAR *src, unsigned int srclen, unsigned int *ch ) -{ - if (IS_HIGH_SURROGATE( src[0] )) - { - if (srclen <= 1) return 0; - if (!IS_LOW_SURROGATE( src[1] )) return 0; - *ch = 0x10000 + ((src[0] & 0x3ff) << 10) + (src[1] & 0x3ff); - return 2; - } - if (IS_LOW_SURROGATE( src[0] )) return 0; - *ch = src[0]; - return 1; -} - static void put_utf16( WCHAR *dst, unsigned int ch ) { if (ch >= 0x10000) @@ -352,6 +360,17 @@ static const struct { const char *name; UINT cp; } charset_names[] = { "UTF8", CP_UTF8 } }; +static void init_unix_cptable( USHORT *ptr ) +{ + unix_cp.data = ptr; + ptr += ptr[0]; + unix_cp.wctable = ptr + ptr[0] + 1; + unix_cp.mbtable = ++ptr; + ptr += 256; + if (*ptr++) ptr += 256; /* glyph table */ + if (*ptr) unix_cp.dbcs = ptr + 1; /* dbcs ranges */ +} + static void init_unix_codepage(void) { char charset_name[16]; @@ -383,7 +402,7 @@ static void init_unix_codepage(void) void *data; sprintf( name, "c_%03u", charset_names[pos].cp ); - if ((data = read_nls_file( name ))) RtlInitCodePageTable( data, &unix_table ); + if ((data = read_nls_file( name ))) init_unix_cptable( data ); } return; } @@ -422,6 +441,58 @@ static inline BOOL is_special_env_var( const char *var ) } +static unsigned int decode_utf8_char( unsigned char ch, const char **str, const char *strend ) +{ + /* number of following bytes in sequence based on first byte value (for bytes above 0x7f) */ + static const char utf8_length[128] = + { + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 0x80-0x8f */ + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 0x90-0x9f */ + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 0xa0-0xaf */ + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 0xb0-0xbf */ + 0,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1, /* 0xc0-0xcf */ + 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, /* 0xd0-0xdf */ + 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, /* 0xe0-0xef */ + 3,3,3,3,3,0,0,0,0,0,0,0,0,0,0,0 /* 0xf0-0xff */ + }; + + /* first byte mask depending on UTF-8 sequence length */ + static const unsigned char utf8_mask[4] = { 0x7f, 0x1f, 0x0f, 0x07 }; + + unsigned int len = utf8_length[ch - 0x80]; + unsigned int res = ch & utf8_mask[len]; + const char *end = *str + len; + + if (end > strend) + { + *str = end; + return ~0; + } + switch (len) + { + case 3: + if ((ch = end[-3] ^ 0x80) >= 0x40) break; + res = (res << 6) | ch; + (*str)++; + if (res < 0x10) break; + case 2: + if ((ch = end[-2] ^ 0x80) >= 0x40) break; + res = (res << 6) | ch; + if (res >= 0x110000 >> 6) break; + (*str)++; + if (res < 0x20) break; + if (res >= 0xd800 >> 6 && res <= 0xdfff >> 6) break; + case 1: + if ((ch = end[-1] ^ 0x80) >= 0x40) break; + res = (res << 6) | ch; + (*str)++; + if (res < 0x80) break; + return res; + } + return ~0; +} + + /****************************************************************** * ntdll_umbstowcs */ @@ -429,15 +500,66 @@ DWORD ntdll_umbstowcs( const char *src, DWORD srclen, WCHAR *dst, DWORD dstlen ) { DWORD reslen; - if (unix_table.CodePage) - RtlCustomCPToUnicodeN( &unix_table, dst, dstlen * sizeof(WCHAR), &reslen, src, srclen ); - else - RtlUTF8ToUnicodeN( dst, dstlen * sizeof(WCHAR), &reslen, src, srclen ); + if (unix_cp.data) + { + DWORD i; - reslen /= sizeof(WCHAR); + if (unix_cp.dbcs) + { + for (i = dstlen; srclen && i; i--, srclen--, src++, dst++) + { + USHORT off = unix_cp.dbcs[(unsigned char)*src]; + if (off && srclen > 1) + { + src++; + srclen--; + *dst = unix_cp.dbcs[off + (unsigned char)*src]; + } + else *dst = unix_cp.mbtable[(unsigned char)*src]; + } + reslen = dstlen - i; + } + else + { + reslen = min( srclen, dstlen ); + for (i = 0; i < reslen; i++) dst[i] = unix_cp.mbtable[(unsigned char)src[i]]; + } + } + else /* utf-8 */ + { + unsigned int res; + const char *srcend = src + srclen; + WCHAR *dstend = dst + dstlen; + + while ((dst < dstend) && (src < srcend)) + { + unsigned char ch = *src++; + if (ch < 0x80) /* special fast case for 7-bit ASCII */ + { + *dst++ = ch; + continue; + } + if ((res = decode_utf8_char( ch, &src, srcend )) <= 0xffff) + { + *dst++ = res; + } + else if (res <= 0x10ffff) /* we need surrogates */ + { + res -= 0x10000; + *dst++ = 0xd800 | (res >> 10); + if (dst == dstend) break; + *dst++ = 0xdc00 | (res & 0x3ff); + } + else + { + *dst++ = 0xfffd; + } + } + reslen = dstlen - (dstend - dst); #ifdef __APPLE__ /* work around broken Mac OS X filesystem that enforces NFD */ - if (reslen && nfc_table) reslen = compose_string( nfc_table, dst, reslen ); + if (reslen && nfc_table) reslen = compose_string( nfc_table, dst, reslen ); #endif + } return reslen; } @@ -449,25 +571,24 @@ int ntdll_wcstoumbs( const WCHAR *src, DWORD srclen, char *dst, DWORD dstlen, BO { DWORD i, reslen; - if (unix_table.CodePage) + if (unix_cp.data) { - if (unix_table.DBCSOffsets) + if (unix_cp.dbcs) { - const unsigned short *uni2cp = unix_table.WideCharTable; + const unsigned short *uni2cp = unix_cp.wctable; for (i = dstlen; srclen && i; i--, srclen--, src++) { unsigned short ch = uni2cp[*src]; if (ch >> 8) { - if (strict && unix_table.DBCSOffsets[unix_table.DBCSOffsets[ch >> 8] + (ch & 0xff)] != *src) - return -1; + if (strict && unix_cp.dbcs[unix_cp.dbcs[ch >> 8] + (ch & 0xff)] != *src) return -1; if (i == 1) break; /* do not output a partial char */ i--; *dst++ = ch >> 8; } else { - if (unix_table.MultiByteTable[ch] != *src) return -1; + if (unix_cp.mbtable[ch] != *src) return -1; *dst++ = (char)ch; } } @@ -475,18 +596,72 @@ int ntdll_wcstoumbs( const WCHAR *src, DWORD srclen, char *dst, DWORD dstlen, BO } else { - const unsigned char *uni2cp = unix_table.WideCharTable; + const unsigned char *uni2cp = unix_cp.wctable; reslen = min( srclen, dstlen ); for (i = 0; i < reslen; i++) { unsigned char ch = uni2cp[src[i]]; - if (strict && unix_table.MultiByteTable[ch] != src[i]) return -1; + if (strict && unix_cp.mbtable[ch] != src[i]) return -1; dst[i] = ch; } } } - else RtlUnicodeToUTF8N( dst, dstlen, &reslen, src, srclen * sizeof(WCHAR) ); + else /* utf-8 */ + { + char *end; + unsigned int val; + for (end = dst + dstlen; srclen; srclen--, src++) + { + WCHAR ch = *src; + + if (ch < 0x80) /* 0x00-0x7f: 1 byte */ + { + if (dst > end - 1) break; + *dst++ = ch; + continue; + } + if (ch < 0x800) /* 0x80-0x7ff: 2 bytes */ + { + if (dst > end - 2) break; + dst[1] = 0x80 | (ch & 0x3f); + ch >>= 6; + dst[0] = 0xc0 | ch; + dst += 2; + continue; + } + if (!get_utf16( src, srclen, &val )) + { + if (strict) return -1; + val = 0xfffd; + } + if (val < 0x10000) /* 0x800-0xffff: 3 bytes */ + { + if (dst > end - 3) break; + dst[2] = 0x80 | (val & 0x3f); + val >>= 6; + dst[1] = 0x80 | (val & 0x3f); + val >>= 6; + dst[0] = 0xe0 | val; + dst += 3; + } + else /* 0x10000-0x10ffff: 4 bytes */ + { + if (dst > end - 4) break; + dst[3] = 0x80 | (val & 0x3f); + val >>= 6; + dst[2] = 0x80 | (val & 0x3f); + val >>= 6; + dst[1] = 0x80 | (val & 0x3f); + val >>= 6; + dst[0] = 0xf0 | val; + dst += 4; + src++; + srclen--; + } + } + reslen = dstlen - (end - dst); + } return reslen; } @@ -1055,13 +1230,13 @@ void CDECL get_initial_directory( UNICODE_STRING *dir ) /************************************************************************* - * get_unix_codepage + * get_unix_codepage_data * * Return the Unix codepage data. */ -void CDECL get_unix_codepage( CPTABLEINFO *table ) +USHORT * CDECL get_unix_codepage_data(void) { - *table = unix_table; + return unix_cp.data; } diff --git a/dlls/ntdll/unix/loader.c b/dlls/ntdll/unix/loader.c index 3ea7adf9f0a..a8af46e2dfd 100644 --- a/dlls/ntdll/unix/loader.c +++ b/dlls/ntdll/unix/loader.c @@ -1509,7 +1509,7 @@ static struct unix_funcs unix_funcs = get_dynamic_environment, get_initial_console, get_initial_directory, - get_unix_codepage, + get_unix_codepage_data, get_locales, get_version, get_build_id, diff --git a/dlls/ntdll/unix/unix_private.h b/dlls/ntdll/unix/unix_private.h index 71d6ee7d4f2..4f93aa2c54b 100644 --- a/dlls/ntdll/unix/unix_private.h +++ b/dlls/ntdll/unix/unix_private.h @@ -100,7 +100,7 @@ extern NTSTATUS CDECL get_startup_info( startup_info_t *info, SIZE_T *total_size extern NTSTATUS CDECL get_dynamic_environment( WCHAR *env, SIZE_T *size ) DECLSPEC_HIDDEN; extern void CDECL get_initial_directory( UNICODE_STRING *dir ) DECLSPEC_HIDDEN; extern void CDECL get_initial_console( HANDLE *handle, HANDLE *std_in, HANDLE *std_out, HANDLE *std_err ) DECLSPEC_HIDDEN; -extern void CDECL get_unix_codepage( CPTABLEINFO *table ) DECLSPEC_HIDDEN; +extern USHORT * CDECL get_unix_codepage_data(void) DECLSPEC_HIDDEN; extern void CDECL get_locales( WCHAR *sys, WCHAR *user ) DECLSPEC_HIDDEN; extern NTSTATUS CDECL virtual_map_section( HANDLE handle, PVOID *addr_ptr, unsigned short zero_bits_64, SIZE_T commit_size, const LARGE_INTEGER *offset_ptr, SIZE_T *size_ptr, ULONG alloc_type, diff --git a/dlls/ntdll/unixlib.h b/dlls/ntdll/unixlib.h index 7e73400dae2..89ca0e960ce 100644 --- a/dlls/ntdll/unixlib.h +++ b/dlls/ntdll/unixlib.h @@ -28,7 +28,7 @@ struct msghdr; struct _DISPATCHER_CONTEXT; /* increment this when you change the function table */ -#define NTDLL_UNIXLIB_VERSION 69 +#define NTDLL_UNIXLIB_VERSION 70 struct unix_funcs { @@ -303,7 +303,7 @@ struct unix_funcs void (CDECL *get_initial_console)( HANDLE *handle, HANDLE *std_in, HANDLE *std_out, HANDLE *std_err ); void (CDECL *get_initial_directory)( UNICODE_STRING *dir ); - void (CDECL *get_unix_codepage)( CPTABLEINFO *table ); + USHORT * (CDECL *get_unix_codepage_data)(void); void (CDECL *get_locales)( WCHAR *sys, WCHAR *user ); const char * (CDECL *get_version)(void); const char * (CDECL *get_build_id)(void);