351 lines
10 KiB
C
351 lines
10 KiB
C
/*
|
|
* Unicode sort key generation
|
|
*
|
|
* Copyright 2003 Dmitry Timoshkov
|
|
*
|
|
* This library is free software; you can redistribute it and/or
|
|
* modify it under the terms of the GNU Lesser General Public
|
|
* License as published by the Free Software Foundation; either
|
|
* version 2.1 of the License, or (at your option) any later version.
|
|
*
|
|
* This library is distributed in the hope that it will be useful,
|
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
|
* Lesser General Public License for more details.
|
|
*
|
|
* You should have received a copy of the GNU Lesser General Public
|
|
* License along with this library; if not, write to the Free Software
|
|
* Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA
|
|
*/
|
|
#include "wine/unicode.h"
|
|
|
|
extern int get_decomposition(WCHAR src, WCHAR *dst, unsigned int dstlen);
|
|
extern const unsigned int collation_table[];
|
|
|
|
/*
|
|
* flags - normalization NORM_* flags
|
|
*
|
|
* FIXME: 'variable' flag not handled
|
|
*/
|
|
int wine_get_sortkey(int flags, const WCHAR *src, int srclen, char *dst, int dstlen)
|
|
{
|
|
WCHAR dummy[4]; /* no decomposition is larger than 4 chars */
|
|
int key_len[4];
|
|
char *key_ptr[4];
|
|
const WCHAR *src_save = src;
|
|
int srclen_save = srclen;
|
|
|
|
key_len[0] = key_len[1] = key_len[2] = key_len[3] = 0;
|
|
for (; srclen; srclen--, src++)
|
|
{
|
|
int decomposed_len = 1;/*get_decomposition(*src, dummy, 4);*/
|
|
dummy[0] = *src;
|
|
if (decomposed_len)
|
|
{
|
|
int i;
|
|
for (i = 0; i < decomposed_len; i++)
|
|
{
|
|
WCHAR wch = dummy[i];
|
|
unsigned int ce;
|
|
|
|
/* tests show that win2k just ignores NORM_IGNORENONSPACE,
|
|
* and skips white space and punctuation characters for
|
|
* NORM_IGNORESYMBOLS.
|
|
*/
|
|
if ((flags & NORM_IGNORESYMBOLS) && (get_char_typeW(wch) & (C1_PUNCT | C1_SPACE)))
|
|
continue;
|
|
|
|
if (flags & NORM_IGNORECASE) wch = tolowerW(wch);
|
|
|
|
ce = collation_table[collation_table[wch >> 8] + (wch & 0xff)];
|
|
if (ce != (unsigned int)-1)
|
|
{
|
|
if (ce >> 16) key_len[0] += 2;
|
|
if ((ce >> 8) & 0xff) key_len[1]++;
|
|
if ((ce >> 4) & 0x0f) key_len[2]++;
|
|
if (ce & 1)
|
|
{
|
|
if (wch >> 8) key_len[3]++;
|
|
key_len[3]++;
|
|
}
|
|
}
|
|
else
|
|
{
|
|
key_len[0] += 2;
|
|
if (wch >> 8) key_len[0]++;
|
|
if (wch & 0xff) key_len[0]++;
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
if (!dstlen) /* compute length */
|
|
/* 4 * '\1' + 1 * '\0' + key length */
|
|
return key_len[0] + key_len[1] + key_len[2] + key_len[3] + 4 + 1;
|
|
|
|
if (dstlen < key_len[0] + key_len[1] + key_len[2] + key_len[3] + 4 + 1)
|
|
return 0; /* overflow */
|
|
|
|
src = src_save;
|
|
srclen = srclen_save;
|
|
|
|
key_ptr[0] = dst;
|
|
key_ptr[1] = key_ptr[0] + key_len[0] + 1;
|
|
key_ptr[2] = key_ptr[1] + key_len[1] + 1;
|
|
key_ptr[3] = key_ptr[2] + key_len[2] + 1;
|
|
|
|
for (; srclen; srclen--, src++)
|
|
{
|
|
int decomposed_len = 1;/*get_decomposition(*src, dummy, 4);*/
|
|
dummy[0] = *src;
|
|
if (decomposed_len)
|
|
{
|
|
int i;
|
|
for (i = 0; i < decomposed_len; i++)
|
|
{
|
|
WCHAR wch = dummy[i];
|
|
unsigned int ce;
|
|
|
|
/* tests show that win2k just ignores NORM_IGNORENONSPACE,
|
|
* and skips white space and punctuation characters for
|
|
* NORM_IGNORESYMBOLS.
|
|
*/
|
|
if ((flags & NORM_IGNORESYMBOLS) && (get_char_typeW(wch) & (C1_PUNCT | C1_SPACE)))
|
|
continue;
|
|
|
|
if (flags & NORM_IGNORECASE) wch = tolowerW(wch);
|
|
|
|
ce = collation_table[collation_table[wch >> 8] + (wch & 0xff)];
|
|
if (ce != (unsigned int)-1)
|
|
{
|
|
WCHAR key;
|
|
if ((key = ce >> 16))
|
|
{
|
|
*key_ptr[0]++ = key >> 8;
|
|
*key_ptr[0]++ = key & 0xff;
|
|
}
|
|
/* make key 1 start from 2 */
|
|
if ((key = (ce >> 8) & 0xff)) *key_ptr[1]++ = key + 1;
|
|
/* make key 2 start from 2 */
|
|
if ((key = (ce >> 4) & 0x0f)) *key_ptr[2]++ = key + 1;
|
|
/* key 3 is always a character code */
|
|
if (ce & 1)
|
|
{
|
|
if (wch >> 8) *key_ptr[3]++ = wch >> 8;
|
|
if (wch & 0xff) *key_ptr[3]++ = wch & 0xff;
|
|
}
|
|
}
|
|
else
|
|
{
|
|
*key_ptr[0]++ = 0xff;
|
|
*key_ptr[0]++ = 0xfe;
|
|
if (wch >> 8) *key_ptr[0]++ = wch >> 8;
|
|
if (wch & 0xff) *key_ptr[0]++ = wch & 0xff;
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
*key_ptr[0] = '\1';
|
|
*key_ptr[1] = '\1';
|
|
*key_ptr[2] = '\1';
|
|
*key_ptr[3]++ = '\1';
|
|
*key_ptr[3] = 0;
|
|
|
|
return key_ptr[3] - dst;
|
|
}
|
|
|
|
static inline int compare_unicode_weights(int flags, const WCHAR *str1, int len1,
|
|
const WCHAR *str2, int len2)
|
|
{
|
|
unsigned int ce1, ce2;
|
|
int ret;
|
|
|
|
/* 32-bit collation element table format:
|
|
* unicode weight - high 16 bit, diacritic weight - high 8 bit of low 16 bit,
|
|
* case weight - high 4 bit of low 8 bit.
|
|
*/
|
|
while (len1 > 0 && len2 > 0)
|
|
{
|
|
if (flags & NORM_IGNORESYMBOLS)
|
|
{
|
|
int skip = 0;
|
|
/* FIXME: not tested */
|
|
if (get_char_typeW(*str1) & (C1_PUNCT | C1_SPACE))
|
|
{
|
|
str1++;
|
|
len1--;
|
|
skip = 1;
|
|
}
|
|
if (get_char_typeW(*str2) & (C1_PUNCT | C1_SPACE))
|
|
{
|
|
str2++;
|
|
len2--;
|
|
skip = 1;
|
|
}
|
|
if (skip) continue;
|
|
}
|
|
|
|
/* hyphen and apostrophe are treated differently depending on
|
|
* whether SORT_STRINGSORT specified or not
|
|
*/
|
|
if (!(flags & SORT_STRINGSORT))
|
|
{
|
|
if (*str1 == '-' || *str1 == '\'')
|
|
{
|
|
if (*str2 != '-' && *str2 != '\'')
|
|
{
|
|
str1++;
|
|
len1--;
|
|
continue;
|
|
}
|
|
}
|
|
else if (*str2 == '-' || *str2 == '\'')
|
|
{
|
|
str2++;
|
|
len2--;
|
|
continue;
|
|
}
|
|
}
|
|
|
|
ce1 = collation_table[collation_table[*str1 >> 8] + (*str1 & 0xff)];
|
|
ce2 = collation_table[collation_table[*str2 >> 8] + (*str2 & 0xff)];
|
|
|
|
if (ce1 != (unsigned int)-1 && ce2 != (unsigned int)-1)
|
|
ret = (ce1 >> 16) - (ce2 >> 16);
|
|
else
|
|
ret = *str1 - *str2;
|
|
|
|
if (ret) return ret;
|
|
|
|
str1++;
|
|
str2++;
|
|
len1--;
|
|
len2--;
|
|
}
|
|
return len1 - len2;
|
|
}
|
|
|
|
static inline int compare_diacritic_weights(int flags, const WCHAR *str1, int len1,
|
|
const WCHAR *str2, int len2)
|
|
{
|
|
unsigned int ce1, ce2;
|
|
int ret;
|
|
|
|
/* 32-bit collation element table format:
|
|
* unicode weight - high 16 bit, diacritic weight - high 8 bit of low 16 bit,
|
|
* case weight - high 4 bit of low 8 bit.
|
|
*/
|
|
while (len1 > 0 && len2 > 0)
|
|
{
|
|
if (flags & NORM_IGNORESYMBOLS)
|
|
{
|
|
int skip = 0;
|
|
/* FIXME: not tested */
|
|
if (get_char_typeW(*str1) & (C1_PUNCT | C1_SPACE))
|
|
{
|
|
str1++;
|
|
len1--;
|
|
skip = 1;
|
|
}
|
|
if (get_char_typeW(*str2) & (C1_PUNCT | C1_SPACE))
|
|
{
|
|
str2++;
|
|
len2--;
|
|
skip = 1;
|
|
}
|
|
if (skip) continue;
|
|
}
|
|
|
|
ce1 = collation_table[collation_table[*str1 >> 8] + (*str1 & 0xff)];
|
|
ce2 = collation_table[collation_table[*str2 >> 8] + (*str2 & 0xff)];
|
|
|
|
if (ce1 != (unsigned int)-1 && ce2 != (unsigned int)-1)
|
|
ret = ((ce1 >> 8) & 0xff) - ((ce2 >> 8) & 0xff);
|
|
else
|
|
ret = *str1 - *str2;
|
|
|
|
if (ret) return ret;
|
|
|
|
str1++;
|
|
str2++;
|
|
len1--;
|
|
len2--;
|
|
}
|
|
return len1 - len2;
|
|
}
|
|
|
|
static inline int compare_case_weights(int flags, const WCHAR *str1, int len1,
|
|
const WCHAR *str2, int len2)
|
|
{
|
|
unsigned int ce1, ce2;
|
|
int ret;
|
|
|
|
/* 32-bit collation element table format:
|
|
* unicode weight - high 16 bit, diacritic weight - high 8 bit of low 16 bit,
|
|
* case weight - high 4 bit of low 8 bit.
|
|
*/
|
|
while (len1 > 0 && len2 > 0)
|
|
{
|
|
if (flags & NORM_IGNORESYMBOLS)
|
|
{
|
|
int skip = 0;
|
|
/* FIXME: not tested */
|
|
if (get_char_typeW(*str1) & (C1_PUNCT | C1_SPACE))
|
|
{
|
|
str1++;
|
|
len1--;
|
|
skip = 1;
|
|
}
|
|
if (get_char_typeW(*str2) & (C1_PUNCT | C1_SPACE))
|
|
{
|
|
str2++;
|
|
len2--;
|
|
skip = 1;
|
|
}
|
|
if (skip) continue;
|
|
}
|
|
|
|
ce1 = collation_table[collation_table[*str1 >> 8] + (*str1 & 0xff)];
|
|
ce2 = collation_table[collation_table[*str2 >> 8] + (*str2 & 0xff)];
|
|
|
|
if (ce1 != (unsigned int)-1 && ce2 != (unsigned int)-1)
|
|
ret = ((ce1 >> 4) & 0x0f) - ((ce2 >> 4) & 0x0f);
|
|
else
|
|
ret = *str1 - *str2;
|
|
|
|
if (ret) return ret;
|
|
|
|
str1++;
|
|
str2++;
|
|
len1--;
|
|
len2--;
|
|
}
|
|
return len1 - len2;
|
|
}
|
|
|
|
static inline int real_length(const WCHAR *str, int len)
|
|
{
|
|
while (len && !str[len - 1]) len--;
|
|
return len;
|
|
}
|
|
|
|
int wine_compare_string(int flags, const WCHAR *str1, int len1,
|
|
const WCHAR *str2, int len2)
|
|
{
|
|
int ret;
|
|
|
|
len1 = real_length(str1, len1);
|
|
len2 = real_length(str2, len2);
|
|
|
|
ret = compare_unicode_weights(flags, str1, len1, str2, len2);
|
|
if (!ret)
|
|
{
|
|
if (!(flags & NORM_IGNORENONSPACE))
|
|
ret = compare_diacritic_weights(flags, str1, len1, str2, len2);
|
|
if (!ret && !(flags & NORM_IGNORECASE))
|
|
ret = compare_case_weights(flags, str1, len1, str2, len2);
|
|
}
|
|
return ret;
|
|
}
|