wmc: Directly implement UTF-8 conversions.

Signed-off-by: Alexandre Julliard <julliard@winehq.org>
2020-01-29 10:39:48 +01:00 · 2020-01-29 10:39:48 +01:00 · c2f6714970
parent 88abd7cb98
commit c2f6714970
6 changed files with 155 additions and 39 deletions
--- a/tools/wmc/lang.c
+++ b/tools/wmc/lang.c
@ -222,13 +222,11 @@ int is_valid_codepage(int id)

 int wmc_mbstowcs( int codepage, int flags, const char *src, int srclen, WCHAR *dst, int dstlen )
 {
-    if (codepage == CP_UTF8) return wine_utf8_mbstowcs( flags, src, srclen, dst, dstlen );
    return wine_cp_mbstowcs( wine_cp_get_table( codepage ), flags, src, srclen, dst, dstlen );
 }

 int wmc_wcstombs( int codepage, int flags, const WCHAR *src, int srclen, char *dst, int dstlen )
 {
-    if (codepage == CP_UTF8) return wine_utf8_wcstombs( flags, src, srclen, dst, dstlen );
    return wine_cp_wcstombs( wine_cp_get_table( codepage ), flags, src, srclen, dst, dstlen, NULL, NULL );
 }

--- a/tools/wmc/mcl.c
+++ b/tools/wmc/mcl.c
@ -198,9 +198,18 @@ try_again:
 			xyyerror(err_fatalread);
 		else if(!cptr)
 			return 0;
+		if (codepage == CP_UTF8)
+		{	/* convert the NUL too, so n counts it exactly like the codepage branch below */
+			WCHAR *buf = utf8_to_unicode( xlatebuffer, strlen(xlatebuffer) + 1, &n );
+			memcpy( inputbuffer, buf, n * sizeof(WCHAR) );	/* n WCHARs, the last one being the converted NUL */
+			free( buf );
+		}
+		else
+		{
 			n = wmc_mbstowcs(codepage, 0, xlatebuffer, strlen(xlatebuffer)+1, inputbuffer, INPUTBUFFER_SIZE);
 			if(n < 0)
 				internal_error(__FILE__, __LINE__, "Could not translate to unicode (%d)\n", n);
+		}
 		if(n <= 1)
 			goto try_again;	/* Should not happen */
 		n--;	/* Strip added conversion '\0' from input length */
--- a/tools/wmc/po.c
+++ b/tools/wmc/po.c
@ -404,14 +404,6 @@ static char *get_message_context( char **msgid )

 #ifdef HAVE_LIBGETTEXTPO

-static char *convert_string_utf8( const lanmsg_t *msg )
-{
-    char *buffer = xmalloc( msg->len * 4 + 1 );
-    int len = wmc_wcstombs( CP_UTF8, 0, msg->msg, msg->len, buffer, msg->len * 4 );
-    buffer[len] = 0;
-    return buffer;
-}
-
 static po_message_t find_message( po_file_t po, const char *msgid, const char *msgctxt,
                                  po_message_iterator_t *iterator )
 {
@ -467,7 +459,8 @@ static void add_po_string( po_file_t po, const lanmsg_t *msgid, const lanmsg_t *

    if (msgstr)
    {
-        str_buffer = str = convert_string_utf8( msgstr );
+        int len;
+        str_buffer = str = unicode_to_utf8( msgstr->msg, msgstr->len, &len );
        if (is_english( msgstr->lan )) get_message_context( &str );
    }
    if (!(msg = find_message( po, id, context, &iterator )))
@ -644,7 +637,6 @@ static lanmsg_t *translate_string( lanmsg_t *str, int lang, int *found )
 {
    lanmsg_t *new;
    const char *transl;
-    int res;
    char *buffer, *msgid, *context;

    if (str->len <= 1 || !(buffer = convert_msgid_ascii( str, 0 ))) return str;
@ -658,11 +650,7 @@ static lanmsg_t *translate_string( lanmsg_t *str, int lang, int *found )
    new->cp   = 0;  /* FIXME */
    new->file = str->file;
    new->line = str->line;
-    new->len  = wmc_mbstowcs( CP_UTF8, 0, transl, strlen(transl) + 1, NULL, 0 );
-    new->msg  = xmalloc( new->len * sizeof(WCHAR) );
-    res = wmc_mbstowcs( CP_UTF8, MB_ERR_INVALID_CHARS, transl, strlen(transl) + 1, new->msg, new->len );
-    if (res == -2)
-        error( "Invalid utf-8 character in string '%s'\n", transl );
+    new->msg  = utf8_to_unicode( transl, strlen(transl) + 1, &new->len );
    free( buffer );
    return new;
 }
--- a/tools/wmc/utils.c
+++ b/tools/wmc/utils.c
@ -272,6 +272,127 @@ int unistrcmp(const WCHAR *s1, const WCHAR *s2)
 	return *s1 - *s2;
 }

+WCHAR *utf8_to_unicode( const char *src, int srclen, int *dstlen )  /* decode srclen bytes of UTF-8 into a new 0-terminated WCHAR (UTF-16) buffer */
+{  /* returns the xmalloc'ed buffer; *dstlen receives the WCHAR count, terminator excluded */
+    static const char utf8_length[128] =  /* continuation-byte count per lead byte 0x80-0xff; 0 = not a valid lead byte */
+    {
+        0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 0x80-0x8f */
+        0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 0x90-0x9f */
+        0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 0xa0-0xaf */
+        0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 0xb0-0xbf */
+        0,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1, /* 0xc0-0xcf */
+        1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, /* 0xd0-0xdf */
+        2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, /* 0xe0-0xef */
+        3,3,3,3,3,0,0,0,0,0,0,0,0,0,0,0  /* 0xf0-0xff */
+    };
+    static const unsigned char utf8_mask[4] = { 0x7f, 0x1f, 0x0f, 0x07 };  /* payload bits of the lead byte, indexed by continuation count */
+
+    const char *srcend = src + srclen;
+    int len, res;
+    WCHAR *ret, *dst;
+
+    dst = ret = xmalloc( (srclen + 1) * sizeof(WCHAR) );  /* every WCHAR written consumes at least one input byte; +1 for the terminator */
+    while (src < srcend)
+    {
+        unsigned char ch = *src++;
+        if (ch < 0x80)  /* special fast case for 7-bit ASCII */
+        {
+            *dst++ = ch;
+            continue;
+        }
+        len = utf8_length[ch - 0x80];
+        if (len && src + len <= srcend)  /* valid lead byte and all its continuation bytes lie inside the input */
+        {
+            res = ch & utf8_mask[len];
+            switch (len)
+            {
+            case 3:
+                if ((ch = *src ^ 0x80) >= 0x40) break;  /* continuation bytes are 0x80-0xbf, i.e. 0x00-0x3f after the xor */
+                res = (res << 6) | ch;
+                src++;
+                if (res < 0x10) break;  /* overlong 4-byte form (below 0x10000); otherwise fall through */
+            case 2:
+                if ((ch = *src ^ 0x80) >= 0x40) break;  /* not a continuation byte */
+                res = (res << 6) | ch;
+                if (res >= 0x110000 >> 6) break;  /* would exceed 0x10ffff */
+                src++;
+                if (res < 0x20) break;  /* overlong 3-byte form (below 0x800) */
+                if (res >= 0xd800 >> 6 && res <= 0xdfff >> 6) break;  /* encoded surrogate 0xd800-0xdfff; otherwise fall through */
+            case 1:
+                if ((ch = *src ^ 0x80) >= 0x40) break;  /* not a continuation byte */
+                res = (res << 6) | ch;
+                src++;
+                if (res < 0x80) break;  /* overlong 2-byte form */
+                if (res <= 0xffff) *dst++ = res;  /* fits in a single WCHAR */
+                else
+                {  /* split into a high/low surrogate pair */
+                    res -= 0x10000;
+                    *dst++ = 0xd800 | (res >> 10);
+                    *dst++ = 0xdc00 | (res & 0x3ff);
+                }
+                continue;  /* decoded successfully, skip the replacement below */
+            }
+        }
+        *dst++ = 0xfffd;  /* invalid, truncated or rejected sequence: emit U+FFFD */
+    }
+    *dst = 0;
+    *dstlen = dst - ret;  /* length excludes the terminator just written */
+    return ret;
+}
+
+char *unicode_to_utf8( const WCHAR *src, int srclen, int *dstlen )  /* encode srclen WCHARs (UTF-16) as a new 0-terminated UTF-8 string */
+{  /* returns the xmalloc'ed buffer; *dstlen receives the byte count, terminator excluded */
+    char *ret, *dst;
+
+    dst = ret = xmalloc( srclen * 3 + 1 );  /* worst case 3 bytes per WCHAR (a surrogate pair is 2 WCHARs -> 4 bytes), plus the terminator */
+    for ( ; srclen; srclen--, src++)
+    {
+        unsigned int ch = *src;
+
+        if (ch < 0x80)  /* 0x00-0x7f: 1 byte */
+        {
+            *dst++ = ch;
+            continue;
+        }
+        if (ch < 0x800)  /* 0x80-0x7ff: 2 bytes */
+        {
+            dst[1] = 0x80 | (ch & 0x3f);  /* bytes are filled last-to-first, 6 payload bits at a time */
+            ch >>= 6;
+            dst[0] = 0xc0 | ch;
+            dst += 2;
+            continue;
+        }
+        if (ch >= 0xd800 && ch <= 0xdbff && srclen > 1 && src[1] >= 0xdc00 && src[1] <= 0xdfff)
+        {  /* high surrogate immediately followed by a low surrogate */
+            /* 0x10000-0x10ffff: 4 bytes */
+            ch = 0x10000 + ((ch & 0x3ff) << 10) + (src[1] & 0x3ff);
+            dst[3] = 0x80 | (ch & 0x3f);
+            ch >>= 6;
+            dst[2] = 0x80 | (ch & 0x3f);
+            ch >>= 6;
+            dst[1] = 0x80 | (ch & 0x3f);
+            ch >>= 6;
+            dst[0] = 0xf0 | ch;
+            dst += 4;
+            src++;  /* also consume the low surrogate */
+            srclen--;
+            continue;
+        }
+        if (ch >= 0xd800 && ch <= 0xdfff) ch = 0xfffd;  /* unpaired surrogate: encode U+FFFD instead */
+
+        /* 0x800-0xffff: 3 bytes */
+        dst[2] = 0x80 | (ch & 0x3f);
+        ch >>= 6;
+        dst[1] = 0x80 | (ch & 0x3f);
+        ch >>= 6;
+        dst[0] = 0xe0 | ch;
+        dst += 3;
+    }
+    *dst = 0;
+    *dstlen = dst - ret;  /* length excludes the terminator just written */
+    return ret;
+}
+
 /*******************************************************************
 *         buffer management
 *
--- a/tools/wmc/utils.h
+++ b/tools/wmc/utils.h
@ -49,6 +49,8 @@ WCHAR *unistrcpy(WCHAR *dst, const WCHAR *src);
 int unistrlen(const WCHAR *s);
 int unistricmp(const WCHAR *s1, const WCHAR *s2);
 int unistrcmp(const WCHAR *s1, const WCHAR *s2);
+WCHAR *utf8_to_unicode( const char *src, int srclen, int *dstlen );
+char *unicode_to_utf8( const WCHAR *src, int srclen, int *dstlen );

 /* buffer management */

--- a/tools/wmc/write.c
+++ b/tools/wmc/write.c
@ -94,17 +94,13 @@ static const char str_header[] =
 	"\n"
        ;

-static char *dup_u2c(int cp, const WCHAR *uc)
+static char *dup_u2c(const WCHAR *uc)	/* lossy 8-bit copy of a 0-terminated WCHAR string in an xmalloc'ed buffer */
 {
-	int len;
-	char *cptr;
+	int i;
+	char *cptr = xmalloc( unistrlen(uc)+1 );	/* one byte per WCHAR plus the terminator, assuming unistrlen() counts WCHARs before the 0 */

-        if (!cp) cp = CP_UTF8;
-        len = wmc_wcstombs(cp, 0, uc, unistrlen(uc)+1, NULL, 0);
-	cptr = xmalloc(len);
-        len = wmc_wcstombs(cp, 0, uc, unistrlen(uc)+1, cptr, len);
-	if (len < 0)
-		internal_error(__FILE__, __LINE__, "Buffer overflow? code %d\n", len);
+        for (i = 0; *uc; i++, uc++) cptr[i] = (*uc <= 0xff) ? *uc : '_';	/* values above 0xff are replaced by '_' */
+        cptr[i] = 0;
 	return cptr;
 }

@ -183,7 +179,7 @@ void write_h_file(const char *fname)
 	{
 		if(ttab[i].type == tok_severity && ttab[i].alias)
 		{
-			cptr = dup_u2c(WMC_DEFAULT_CODEPAGE, ttab[i].alias);
+			cptr = dup_u2c(ttab[i].alias);
 			fprintf(fp, "#define %s\t0x%x\n", cptr, ttab[i].token);
 			free(cptr);
 		}
@ -195,7 +191,7 @@ void write_h_file(const char *fname)
 	{
 		if(ttab[i].type == tok_facility && ttab[i].alias)
 		{
-			cptr = dup_u2c(WMC_DEFAULT_CODEPAGE, ttab[i].alias);
+			cptr = dup_u2c(ttab[i].alias);
 			fprintf(fp, "#define %s\t0x%x\n", cptr, ttab[i].token);
 			free(cptr);
 		}
@ -209,7 +205,7 @@ void write_h_file(const char *fname)
 		switch(ndp->type)
 		{
 		case nd_comment:
-			cptr = dup_u2c(WMC_DEFAULT_CODEPAGE, ndp->u.comment+1);
+			cptr = dup_u2c(ndp->u.comment+1);
 			killnl(cptr, 0);
 			killcomment(cptr);
 			if(*cptr)
@ -237,14 +233,14 @@ void write_h_file(const char *fname)
 				fprintf(fp, "\n");
 			}
 			fprintf(fp, "/* MessageId  : 0x%08x */\n", ndp->u.msg->realid);
-			cptr = dup_u2c(ndp->u.msg->msgs[idx_en]->cp, ndp->u.msg->msgs[idx_en]->msg);
+			cptr = dup_u2c(ndp->u.msg->msgs[idx_en]->msg);
 			killnl(cptr, 0);
 			killcomment(cptr);
 			fprintf(fp, "/* Approximate msg: %s */\n", cptr);
 			free(cptr);
-			cptr = dup_u2c(WMC_DEFAULT_CODEPAGE, ndp->u.msg->sym);
+			cptr = dup_u2c(ndp->u.msg->sym);
 			if(ndp->u.msg->cast)
-				cast = dup_u2c(WMC_DEFAULT_CODEPAGE, ndp->u.msg->cast);
+				cast = dup_u2c(ndp->u.msg->cast);
 			else
 				cast = NULL;
 			switch(ndp->u.msg->base)
@ -299,7 +295,7 @@ static void write_rcbin(FILE *fp)
 			if(ttab[i].type == tok_language && ttab[i].token == lbp->lan)
 			{
 				if(ttab[i].alias)
-					cptr = dup_u2c(WMC_DEFAULT_CODEPAGE, ttab[i].alias);
+					cptr = dup_u2c(ttab[i].alias);
 				break;
 			}
 		}
@ -317,7 +313,7 @@ static char *make_string(WCHAR *uc, int len, int codepage)
 	int i;
 	int b;

-	if(!codepage)
+	if (!codepage || codepage == CP_UTF8)
 	{
 		*cptr++ = ' ';
 		*cptr++ = 'L';
@ -379,8 +375,10 @@ static char *make_string(WCHAR *uc, int len, int codepage)
 	else
 	{
 		char *tmp, *cc;
+		int unilen = unistrlen(uc) + 1;

-		cc = tmp = dup_u2c(codepage, uc);
+		cc = tmp = xmalloc( unilen * 2 );	/* scratch buffer for the converted text, read through cc below */
+		wmc_wcstombs( codepage, 0, uc, unilen, tmp, unilen * 2 );	/* fill tmp, not the output cursor cptr */
 		*cptr++ = ' ';
 		*cptr++ = '"';
 		for(i = b = 0; i < len; i++, cc++)
@ -539,7 +537,7 @@ void write_bin_files(void)
        {
            if (ttab[i].type == tok_language && ttab[i].token == lbp->lan)
            {
-                if (ttab[i].alias) cptr = dup_u2c(WMC_DEFAULT_CODEPAGE, ttab[i].alias);
+                if (ttab[i].alias) cptr = dup_u2c(ttab[i].alias);
                break;
            }
        }