From aea78538c8687f0649c1fc6a9e03b377a0f73c39 Mon Sep 17 00:00:00 2001
From: Alexandre Julliard
Date: Fri, 11 Aug 2000 00:44:33 +0000
Subject: [PATCH] Added UTF-8 conversion support.

---
 include/wine/unicode.h |   2 +
 memory/codepage.c      |  67 ++++++++-------
 unicode/Makefile.in    |   1 +
 unicode/utf8.c         | 198 +++++++++++++++++++++++++++++++++++++++++
 4 files changed, 239 insertions(+), 29 deletions(-)
 create mode 100644 unicode/utf8.c

diff --git a/include/wine/unicode.h b/include/wine/unicode.h
index baac36608b7..27223ae0a2a 100644
--- a/include/wine/unicode.h
+++ b/include/wine/unicode.h
@@ -53,6 +53,8 @@ extern int cp_mbstowcs( const union cptable *table, int flags,
 extern int cp_wcstombs( const union cptable *table, int flags,
                         const WCHAR *src, int srclen, char *dst, int dstlen,
                         const char *defchar, int *used );
+extern int utf8_wcstombs( const WCHAR *src, int srclen, char *dst, int dstlen );
+extern int utf8_mbstowcs( int flags, const char *src, int srclen, WCHAR *dst, int dstlen );
 
 static inline int is_dbcs_leadbyte( const union cptable *table, unsigned char ch )
 {
diff --git a/memory/codepage.c b/memory/codepage.c
index ae86d50ba32..0223618add9 100644
--- a/memory/codepage.c
+++ b/memory/codepage.c
@@ -254,23 +254,27 @@ INT WINAPI MultiByteToWideChar( UINT page, DWORD flags, LPCSTR src, INT srclen,
 
     if (srclen == -1) srclen = strlen(src) + 1;
 
-    if (page >= CP_UTF7)
-    {
-        FIXME("UTF not supported\n");
-        SetLastError( ERROR_CALL_NOT_IMPLEMENTED );
-        return 0;
-    }
-
-    if (!(table = get_codepage_table( page )))
-    {
-        SetLastError( ERROR_INVALID_PARAMETER );
-        return 0;
-    }
-
     if (flags & MB_COMPOSITE) FIXME("MB_COMPOSITE not supported\n");
     if (flags & MB_USEGLYPHCHARS) FIXME("MB_USEGLYPHCHARS not supported\n");
 
-    ret = cp_mbstowcs( table, flags, src, srclen, dst, dstlen );
+    switch(page)
+    {
+    case CP_UTF7:
+        FIXME("UTF-7 not supported\n");
+        SetLastError( ERROR_CALL_NOT_IMPLEMENTED );
+        return 0;
+    case CP_UTF8:
+        ret = utf8_mbstowcs( flags, src, srclen, dst, dstlen );
+        break;
+    default:
+        if (!(table = get_codepage_table( page )))
+        {
+            SetLastError( ERROR_INVALID_PARAMETER );
+            return 0;
+        }
+        ret = cp_mbstowcs( table, flags, src, srclen, dst, dstlen );
+        break;
+    }
 
     if (ret < 0)
     {
@@ -326,23 +330,28 @@ INT WINAPI WideCharToMultiByte( UINT page, DWORD flags, LPCWSTR src, INT srclen,
 
     if (srclen == -1) srclen = strlenW(src) + 1;
 
-    if (page >= CP_UTF7)
-    {
-        FIXME("UTF not supported\n");
-        SetLastError( ERROR_CALL_NOT_IMPLEMENTED );
-        return 0;
-    }
-
-    if (!(table = get_codepage_table( page )))
-    {
-        SetLastError( ERROR_INVALID_PARAMETER );
-        return 0;
-    }
-
 /*    if (flags & WC_COMPOSITECHECK) FIXME( "WC_COMPOSITECHECK (%lx) not supported\n", flags );*/
 
-    ret = cp_wcstombs( table, flags, src, srclen, dst, dstlen, defchar, used ? &used_tmp : NULL );
-    if (used) *used = used_tmp;
+    switch(page)
+    {
+    case CP_UTF7:
+        FIXME("UTF-7 not supported\n");
+        SetLastError( ERROR_CALL_NOT_IMPLEMENTED );
+        return 0;
+    case CP_UTF8:
+        ret = utf8_wcstombs( src, srclen, dst, dstlen );
+        break;
+    default:
+        if (!(table = get_codepage_table( page )))
+        {
+            SetLastError( ERROR_INVALID_PARAMETER );
+            return 0;
+        }
+        ret = cp_wcstombs( table, flags, src, srclen, dst, dstlen,
+                           defchar, used ? &used_tmp : NULL );
+        if (used) *used = used_tmp;
+        break;
+    }
 
     if (ret == -1)
     {
diff --git a/unicode/Makefile.in b/unicode/Makefile.in
index 10be462cf6d..22f77f4ebfd 100644
--- a/unicode/Makefile.in
+++ b/unicode/Makefile.in
@@ -70,6 +70,7 @@ C_SRCS = \
 	cptable.c \
 	mbtowc.c \
 	string.c \
+	utf8.c \
 	wctomb.c \
 	$(CODEPAGES:%=c_%.c)
 
diff --git a/unicode/utf8.c b/unicode/utf8.c
new file mode 100644
index 00000000000..66992e67386
--- /dev/null
+++ b/unicode/utf8.c
@@ -0,0 +1,198 @@
+/*
+ * UTF-8 support routines
+ *
+ * Copyright 2000 Alexandre Julliard
+ */
+
+#include <string.h>
+
+#include "winnls.h"
+#include "wine/unicode.h"
+
+/* number of following bytes in sequence based on first byte value (for bytes above 0x7f) */
+/* trailing bytes (0x80-0xbf) and 0xfe/0xff map to 0 and are rejected by utf8_mbstowcs */
+static const char utf8_length[128] =
+{
+    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 0x80-0x8f */
+    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 0x90-0x9f */
+    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 0xa0-0xaf */
+    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 0xb0-0xbf */
+    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, /* 0xc0-0xcf */
+    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, /* 0xd0-0xdf */
+    2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, /* 0xe0-0xef */
+    3,3,3,3,3,3,3,3,4,4,4,4,5,5,0,0  /* 0xf0-0xff */
+};
+
+/* first byte mask depending on UTF-8 sequence length */
+static const unsigned char utf8_mask[6] = { 0x7f, 0x1f, 0x0f, 0x07, 0x03, 0x01 };
+
+/* minimum Unicode value depending on UTF-8 sequence length */
+/* (a decoded value below this minimum means the sequence was overlong) */
+static const unsigned int utf8_minval[6] = { 0x0, 0x80, 0x800, 0x10000, 0x200000, 0x4000000 };
+
+
+/* query necessary dst length for src string */
+/* each WCHAR needs 1 byte below 0x80, 2 bytes below 0x800, 3 bytes otherwise */
+inline static int get_length_wcs_utf8( const WCHAR *src, unsigned int srclen )
+{
+    int len;
+    for (len = 0; srclen; srclen--, src++, len++)
+    {
+        if (*src >= 0x80)
+        {
+            len++;
+            if (*src >= 0x800) len++;
+        }
+    }
+    return len;
+}
+
+/* wide char to UTF-8 string conversion */
+/* if dstlen is 0, nothing is written and the required dst size in bytes is returned */
+/* otherwise return the number of bytes written to dst, or -1 on dst buffer overflow */
+int utf8_wcstombs( const WCHAR *src, int srclen, char *dst, int dstlen )
+{
+    int len;
+
+    if (!dstlen) return get_length_wcs_utf8( src, srclen );
+
+    /* len tracks the space left in dst, so that dstlen - len is the byte count */
+    for (len = dstlen; srclen; srclen--, src++)
+    {
+        WCHAR ch = *src;
+
+        if (ch < 0x80)  /* 0x00-0x7f: 1 byte */
+        {
+            if (!len--) return -1;  /* overflow */
+            *dst++ = ch;
+            continue;
+        }
+
+        if (ch < 0x800)  /* 0x80-0x7ff: 2 bytes */
+        {
+            if ((len -= 2) < 0) return -1;  /* overflow */
+            dst[1] = 0x80 | (ch & 0x3f);
+            ch >>= 6;
+            dst[0] = 0xc0 | ch;
+            dst += 2;
+            continue;
+        }
+
+        /* 0x800-0xffff: 3 bytes */
+
+        if ((len -= 3) < 0) return -1;  /* overflow */
+        dst[2] = 0x80 | (ch & 0x3f);
+        ch >>= 6;
+        dst[1] = 0x80 | (ch & 0x3f);
+        ch >>= 6;
+        dst[0] = 0xe0 | ch;
+        dst += 3;
+    }
+    return dstlen - len;
+}
+
+/* query necessary dst length for src string */
+/* counts one WCHAR per byte or sequence, valid or not; a truncated trailing sequence is ignored */
+inline static int get_length_mbs_utf8( const unsigned char *src, int srclen )
+{
+    int ret;
+    const unsigned char *srcend = src + srclen;
+
+    for (ret = 0; src < srcend; ret++)
+    {
+        unsigned char ch = *src++;
+        if (ch < 0xc0) continue;
+
+        /* the cases fall through on purpose: one trailing byte is checked per case */
+        switch(utf8_length[ch-0x80])
+        {
+        case 5:
+            if (src >= srcend) return ret;  /* ignore partial char */
+            if ((ch = *src ^ 0x80) >= 0x40) continue;
+            src++;
+        case 4:
+            if (src >= srcend) return ret;  /* ignore partial char */
+            if ((ch = *src ^ 0x80) >= 0x40) continue;
+            src++;
+        case 3:
+            if (src >= srcend) return ret;  /* ignore partial char */
+            if ((ch = *src ^ 0x80) >= 0x40) continue;
+            src++;
+        case 2:
+            if (src >= srcend) return ret;  /* ignore partial char */
+            if ((ch = *src ^ 0x80) >= 0x40) continue;
+            src++;
+        case 1:
+            if (src >= srcend) return ret;  /* ignore partial char */
+            if ((ch = *src ^ 0x80) >= 0x40) continue;
+            src++;
+        }
+    }
+    return ret;
+}
+
+/* UTF-8 to wide char string conversion */
+/* if dstlen is 0, nothing is written and the required dst size in WCHARs is returned */
+/* otherwise return the number of WCHARs written to dst, */
+/* -1 on dst buffer overflow, -2 on invalid input char (only with MB_ERR_INVALID_CHARS) */
+int utf8_mbstowcs( int flags, const char *src, int srclen, WCHAR *dst, int dstlen )
+{
+    int len, count;
+    unsigned int res;
+    const char *srcend = src + srclen;
+
+    if (!dstlen) return get_length_mbs_utf8( (const unsigned char *)src, srclen );
+
+    for (count = dstlen; count && (src < srcend); count--, dst++)
+    {
+        unsigned char ch = *src++;
+        if (ch < 0x80)  /* special fast case for 7-bit ASCII */
+        {
+            *dst = ch;
+            continue;
+        }
+        len = utf8_length[ch-0x80];
+        res = ch & utf8_mask[len];
+
+        /* the cases fall through on purpose: each one folds in one trailing byte */
+        /* len == 0 (stray trailing byte, 0xfe, 0xff) matches no case and ends up at bad */
+        switch(len)
+        {
+        case 5:
+            if (src >= srcend) goto done;  /* ignore partial char */
+            if ((ch = *src ^ 0x80) >= 0x40) goto bad;
+            res = (res << 6) | ch;
+            src++;
+        case 4:
+            if (src >= srcend) goto done;  /* ignore partial char */
+            if ((ch = *src ^ 0x80) >= 0x40) goto bad;
+            res = (res << 6) | ch;
+            src++;
+        case 3:
+            if (src >= srcend) goto done;  /* ignore partial char */
+            if ((ch = *src ^ 0x80) >= 0x40) goto bad;
+            res = (res << 6) | ch;
+            src++;
+        case 2:
+            if (src >= srcend) goto done;  /* ignore partial char */
+            if ((ch = *src ^ 0x80) >= 0x40) goto bad;
+            res = (res << 6) | ch;
+            src++;
+        case 1:
+            if (src >= srcend) goto done;  /* ignore partial char */
+            if ((ch = *src ^ 0x80) >= 0x40) goto bad;
+            res = (res << 6) | ch;
+            src++;
+            if (res < utf8_minval[len]) goto bad;  /* overlong encoding */
+            if (res >= 0x10000) goto bad;  /* FIXME: maybe we should do surrogates here */
+            *dst = res;
+            continue;
+        }
+    bad:
+        if (flags & MB_ERR_INVALID_CHARS) return -2;  /* bad char */
+        *dst = (WCHAR)'?';
+    }
+    if (src < srcend) return -1;  /* overflow */
+done:
+    return dstlen - count;
+}