dlls/kernel32/locale.c |   2 +-
include/wine/unicode.h |   2 +-
libs/wine/sortkey.c    | 385 +++++++++++++++++++++++++++++++++++++++-----
3 files changed, 345 insertions(+), 44 deletions(-)

diff --git a/dlls/kernel32/locale.c b/dlls/kernel32/locale.c
index 253032d..9b2b99f 100644
--- a/dlls/kernel32/locale.c
+++ b/dlls/kernel32/locale.c
@@ -3376,7 +3376,7 @@ INT WINAPI CompareStringEx(LPCWSTR locale, DWORD flags, LPCWSTR str1, INT len1,
     if (len1 < 0) len1 = strlenW(str1);
     if (len2 < 0) len2 = strlenW(str2);
 
-    ret = wine_compare_string(flags, str1, len1, str2, len2);
+    ret = wine_compare_string(LocaleNameToLCID(locale, 0), flags, str1, len1, str2, len2);
 
     if (ret) /* need to translate result */
         return (ret < 0) ? CSTR_LESS_THAN : CSTR_GREATER_THAN;
diff --git a/include/wine/unicode.h b/include/wine/unicode.h
index 35c6166..34e660e 100644
--- a/include/wine/unicode.h
+++ b/include/wine/unicode.h
@@ -97,7 +97,7 @@ extern int wine_cpsymbol_wcstombs( const WCHAR *src, int srclen, char *dst, int
 extern int wine_utf8_mbstowcs( int flags, const char *src, int srclen, WCHAR *dst, int dstlen );
 extern int wine_utf8_wcstombs( int flags, const WCHAR *src, int srclen, char *dst, int dstlen );
 
-extern int wine_compare_string( int flags, const WCHAR *str1, int len1, const WCHAR *str2, int len2 );
+extern int wine_compare_string( LCID lcid, int flags, const WCHAR *str1, int len1, const WCHAR *str2, int len2 );
 extern int wine_get_sortkey( int flags, const WCHAR *src, int srclen, char *dst, int dstlen );
 extern int wine_fold_string( int flags, const WCHAR *src, int srclen , WCHAR *dst, int dstlen );
 
diff --git a/libs/wine/sortkey.c b/libs/wine/sortkey.c
index 7280501..a160c24 100644
--- a/libs/wine/sortkey.c
+++ b/libs/wine/sortkey.c
@@ -18,6 +18,9 @@
  * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA
  */
 #include "wine/unicode.h"
+#include <ctype.h>
+#include "wine/debug.h"
+WINE_DEFAULT_DEBUG_CHANNEL(string);
 
 extern int get_decomposition(WCHAR src, WCHAR *dst, unsigned int dstlen);
 extern const unsigned int collation_table[];
@@ -155,11 +158,199 @@ int wine_get_sortkey(int flags, const WCHAR *src, int srclen, char *dst, int dst
     return key_ptr[3] - dst;
 }
 
-static inline int compare_unicode_weights(int flags, const WCHAR *str1, int len1,
+/* Non-zero for ASCII characters that are neither letters nor digits. */
+static inline int is_windows_special_character(WCHAR ch) {
+    return ch <= 0x7f && !isalnum(ch);
+}
+
+/* Non-zero for what word sort skips: controls 0-8 and 14-31, DEL, apostrophe and hyphen. */
+static inline int is_ignored_character_in_word_sort(WCHAR ch) {
+    return (ch <= 8 || (ch >= 14 && ch <= 31) || ch == 127 ||
+            ch == '\'' || ch == '-');
+}
+
+/* Code point range tests used by the kana type / width insensitive comparisons. */
+static inline int is_hiragana(WCHAR ch) {
+    return (ch >= 0x3040 && ch <= 0x309F);
+}
+
+static inline int is_wide_katakana(WCHAR ch) {
+    return (ch >= 0x30A0 && ch <= 0x30FF);
+}
+
+static inline int is_half_katakana(WCHAR ch) {
+    return (ch >= 0xFF60 && ch <= 0xFF9F);
+}
+
+static inline int is_wide_kana(WCHAR ch) {
+    return is_hiragana(ch) || is_wide_katakana(ch);
+}
+
+static inline int is_kana(WCHAR ch) {
+    return is_wide_kana(ch) || is_half_katakana(ch);
+}
+
+static inline int is_kanji(WCHAR ch) {
+    return (ch >= 0x4E00 && ch <= 0x9FBF);
+}
+
+static inline int is_ascii_latin(WCHAR ch) {
+    return ((ch >= 'a' && ch <= 'z') ||
+            (ch >= 'A' && ch <= 'Z'));
+}
+
+static inline int is_wide_latin(WCHAR ch) {
+    return ((ch >= 0xFF21 && ch <= 0xFF39) ||
+            (ch >= 0xFF41 && ch <= 0xFF59));
+}
+
+/* Map a fullwidth Latin letter to its ASCII counterpart; anything else is returned as is. */
+static inline WCHAR wide_latin_to_ascii(WCHAR ch) {
+    if (ch >= 0xFF21 && ch <= 0xFF39) /* AーZ */
+        return ch - 0xFF21 + 'A';
+    else if (ch >= 0xFF41 && ch <= 0xFF59) /* aーz */
+        return ch - 0xFF41 + 'a';
+    else
+        return ch;
+}
+
+/* Non-zero when the unicode (primary) weight of ch lies in 0x0A15..0x0C13. */
+static inline int is_latin(WCHAR ch) {
+    /* Warning: untested */
+    unsigned int ce = collation_table[collation_table[ch >> 8] + (ch & 0xff)];
+    unsigned int unicode_weight = ce >> 16;
+    return (unicode_weight >= 0x0A15 && unicode_weight <= 0x0C13);
+}
+
+static inline int is_japanese(WCHAR ch) {
+    return is_wide_kana(ch) || is_half_katakana(ch) || is_kanji(ch);
+}
+
+/* Convert the halfwidth form at **str (caller guarantees U+FF60..U+FF9F) to
+ * its fullwidth equivalent; a bare (han)dakuten mark yields '?'. A following
+ * halfwidth dakuten/handakuten is folded in by adding 1/2 and is consumed by
+ * advancing *str and decrementing *len.
+ * NOTE(review): that folding is only valid for kana that have a voiced form. */
+static inline WCHAR katakana_hankaku_to_hanzen(const WCHAR ** str, int *len) {
+    static const WCHAR katakana_map[] = {
+        /* 　。「」、・ヲァィゥェォャュョッ
+           ーアイウエオカキクケコサシスセソ
+           タチツテトナニヌネノハヒフヘホマ
+           ミムメモヤユヨラリルレロワン */
+        0x3000, 0x3002, 0x300c, 0x300d, 0x3001, 0x30fb, 0x30f2, 0x30a1,
+        0x30a3, 0x30a5, 0x30a7, 0x30a9, 0x30e3, 0x30e5, 0x30e7, 0x30c3,
+        0x30fc, 0x30a2, 0x30a4, 0x30a6, 0x30a8, 0x30aa, 0x30ab, 0x30ad,
+        0x30af, 0x30b1, 0x30b3, 0x30b5, 0x30b7, 0x30b9, 0x30bb, 0x30bd,
+        0x30bf, 0x30c1, 0x30c4, 0x30c6, 0x30c8, 0x30ca, 0x30cb, 0x30cc,
+        0x30cd, 0x30ce, 0x30cf, 0x30d2, 0x30d5, 0x30d8, 0x30db, 0x30de,
+        0x30df, 0x30e0, 0x30e1, 0x30e2, 0x30e4, 0x30e6, 0x30e8, 0x30e9,
+        0x30ea, 0x30eb, 0x30ec, 0x30ed, 0x30ef, 0x30f3
+    };
+
+    WCHAR ch;
+    ch = **str;
+    if (ch < 0xFF9E) { /* it's not a digraph mark */
+        ch = katakana_map[ch - 0xFF60];
+    } else {
+        return '?';
+    }
+
+    if (*len >= 2) {
+        if ((*str)[1] == 0xFF9E) { /* dakuten */
+            ch += 1;
+            (*str)++; (*len)--;
+        } else if ((*str)[1] == 0xFF9F) { /* handakuten */
+            ch += 2;
+            (*str)++; (*len)--;
+        }
+    }
+
+    return ch;
+}
+
+/* Return 1 when one string is exhausted and everything left in the other is
+ * punctuation or whitespace (C1_PUNCT | C1_SPACE), 0 otherwise. */
+static inline int only_symbols_remains(const WCHAR * str1, int len1, const WCHAR * str2, int len2) {
+    const WCHAR* remaining_str;
+    int remaining_len;
+    if (len1 == 0) {
+        remaining_str = str2;
+        remaining_len = len2;
+    } else if (len2 == 0) {
+        remaining_str = str1;
+        remaining_len = len1;
+    } else {
+        return 0;
+    }
+
+    while (remaining_len > 0) {
+        if (get_char_typeW(*remaining_str) & (C1_PUNCT | C1_SPACE))
+        {
+            remaining_str++;
+            remaining_len--;
+        } else {
+            return 0;
+        }
+    }
+    return 1;
+}
+
+/* Sort weight of an ASCII non-alphanumeric character, looked up by code point.
+ * Letters and digits hold 0: callers only ask about "special" characters. */
+static inline int get_windows_special_character_weight(LCID lcid, WCHAR ch) {
+    /* This is tested for english locale, but works for many others */
+    static const int basic_weight_table[] = {
+        /*0*/00, /*1*/10, /*2*/20, /*3*/30, /*4*/40,
+        /*5*/50, /*6*/60, /*7*/70, /*8*/80, /*9*/310,
+        /*10*/320, /*11*/330, /*12*/340, /*13*/350, /*14*/90,
+        /*15*/100, /*16*/110, /*17*/120, /*18*/130, /*19*/140,
+        /*20*/150, /*21*/160, /*22*/170, /*23*/180, /*24*/190,
+        /*25*/200, /*26*/210, /*27*/220, /*28*/230, /*29*/240,
+        /*30*/250, /*31*/260, /*32*/300, /*33:!*/360, /*34:"*/370,
+        /*35:#*/380, /*36:$*/390, /*37:%*/400, /*38:&*/410, /*39:'*/280,
+        /*40:(*/420, /*41:)*/430, /*42:**/440, /*43:+*/620, /*44:0,*/450,
+        /*45:-*/290, /*46:.*/460, /*47:/*/470,
+        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* [0-9] */
+        /*58::*/480, /*59:;*/490, /*60:<*/630, /*61:=*/640,
+        /*62:>*/650, /*63:?*/500, /*64:@*/510,
+        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* [A-Z] */
+        00, /*91:[*/520, /*92:\*/530, /*93:]*/540,
+        /*94:^*/550, /*95:_*/560, /*96:`*/570,
+        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* [a-z] */
+        /*123:{*/580, /*124:|*/590, /*125:}*/600, /*126:~*/610, /*127*/270
+    };
+
+    if (ch <= 127) {
+        /* Some languages have differences on how symbols are ordered in Windows.
+         * They can be written below. */
+        const WORD primary_lang = PRIMARYLANGID(LANGIDFROMLCID(lcid));
+        if ((primary_lang == LANG_JAPANESE || primary_lang == LANG_KOREAN) && ch == '\\') {
+            return 651; /* In japanese and korean locales, backslash (which is often displayed as
+                           yen or won symbol, respectively, instead) is sorted just after '>' and
+                           before [0-9]. */
+        } else {
+            return basic_weight_table[ch];
+        }
+    } else {
+        ERR("get_windows_special_character_weight has received a non-ascii char.\n");
+        return 0;
+    }
+}
+
+static inline int compare_unicode_weights(LCID lcid, int flags, const WCHAR *str1, int len1,
                                           const WCHAR *str2, int len2)
 {
     unsigned int ce1, ce2;
     int ret;
+    /* after the loops below: 1 + number of trailing word-sort-ignorable characters */
+    int last_no_ignored1 = 1, last_no_ignored2 = 1;
+
+    while (last_no_ignored1 <= len1 &&
+           is_ignored_character_in_word_sort(str1[len1-last_no_ignored1]))
+        last_no_ignored1++;
+    while (last_no_ignored2 <= len2 &&
+           is_ignored_character_in_word_sort(str2[len2-last_no_ignored2]))
+        last_no_ignored2++;
 
     /* 32-bit collation element table format:
      * unicode weight - high 16 bit, diacritic weight - high 8 bit of low 16 bit,
@@ -191,30 +382,34 @@ static inline int compare_unicode_weights(int flags, const WCHAR *str1, int len1
          */
         if (!(flags & SORT_STRINGSORT))
         {
-            if (*str1 == '-' || *str1 == '\'')
-            {
-                if (*str2 != '-' && *str2 != '\'')
-                {
-                    str1++;
-                    len1--;
-                    continue;
-                }
-            }
-            else if (*str2 == '-' || *str2 == '\'')
+            if (len1 > last_no_ignored1 && is_ignored_character_in_word_sort(*str1))
             {
+                str1++;
+                len1--;
+                continue;
+            } else if (len2 > last_no_ignored2 && is_ignored_character_in_word_sort(*str2)) {
                 str2++;
                 len2--;
                 continue;
             }
         }
 
-        ce1 = collation_table[collation_table[*str1 >> 8] + (*str1 & 0xff)];
-        ce2 = collation_table[collation_table[*str2 >> 8] + (*str2 & 0xff)];
-
-        if (ce1 != (unsigned int)-1 && ce2 != (unsigned int)-1)
-            ret = (ce1 >> 16) - (ce2 >> 16);
-        else
-            ret = *str1 - *str2;
+        if (!(is_windows_special_character(*str1) &&
+              is_windows_special_character(*str2)))
+        {
+            ce1 = collation_table[collation_table[*str1 >> 8] + (*str1 & 0xff)];
+            ce2 = collation_table[collation_table[*str2 >> 8] + (*str2 & 0xff)];
+
+            if (ce1 != (unsigned int)-1 && ce2 != (unsigned int)-1)
+                ret = (ce1 >> 16) - (ce2 >> 16);
+            else
+                ret = *str1 - *str2;
+        } else {
+            int weight1, weight2;
+            weight1 = get_windows_special_character_weight(lcid, *str1);
+            weight2 = get_windows_special_character_weight(lcid, *str2);
+            ret = weight1 - weight2;
+        }
 
         if (ret) return ret;
 
@@ -233,14 +428,26 @@ static inline int compare_unicode_weights(int flags, const WCHAR *str1, int len1
         str2++;
         len2--;
     }
+    /* if NORM_IGNORESYMBOLS is set, "dream," must match "dream" as equal. */
+    if ((flags & NORM_IGNORESYMBOLS) && only_symbols_remains(str1, len1, str2, len2))
+        return 0;
     return len1 - len2;
 }
 
-static inline int compare_diacritic_weights(int flags, const WCHAR *str1, int len1,
-                                            const WCHAR *str2, int len2)
+static inline int compare_case_weights(LCID lcid, int flags, const WCHAR *str1, int len1,
+                                       const WCHAR *str2, int len2)
 {
     unsigned int ce1, ce2;
     int ret;
+    /* after the loops below: 1 + number of trailing word-sort-ignorable characters */
+    int last_no_ignored1 = 1, last_no_ignored2 = 1;
+
+    while (last_no_ignored1 <= len1 &&
+           is_ignored_character_in_word_sort(str1[len1-last_no_ignored1]))
+        last_no_ignored1++;
+    while (last_no_ignored2 <= len2 &&
+           is_ignored_character_in_word_sort(str2[len2-last_no_ignored2]))
+        last_no_ignored2++;
 
     /* 32-bit collation element table format:
      * unicode weight - high 16 bit, diacritic weight - high 8 bit of low 16 bit,
@@ -267,13 +474,69 @@ static inline int compare_diacritic_weights(int flags, const WCHAR *str1, int le
             if (skip) continue;
         }
 
-        ce1 = collation_table[collation_table[*str1 >> 8] + (*str1 & 0xff)];
-        ce2 = collation_table[collation_table[*str2 >> 8] + (*str2 & 0xff)];
+        /* hyphen and apostrophe are treated differently depending on
+         * whether SORT_STRINGSORT specified or not
+         */
+        if (!(flags & SORT_STRINGSORT))
+        {
+            if (len1 > last_no_ignored1 && is_ignored_character_in_word_sort(*str1))
+            {
+                str1++;
+                len1--;
+                continue;
+            } else if (len2 > last_no_ignored2 && is_ignored_character_in_word_sort(*str2)) {
+                str2++;
+                len2--;
+                continue;
+            }
+        }
 
-        if (ce1 != (unsigned int)-1 && ce2 != (unsigned int)-1)
-            ret = ((ce1 >> 8) & 0xff) - ((ce2 >> 8) & 0xff);
-        else
-            ret = *str1 - *str2;
+        if (!(is_windows_special_character(*str1) &&
+              is_windows_special_character(*str2)))
+        {
+            int case1, case2;
+            ce1 = collation_table[collation_table[*str1 >> 8] + (*str1 & 0xff)];
+            ce2 = collation_table[collation_table[*str2 >> 8] + (*str2 & 0xff)];
+
+            case1 = (ce1 >> 4) &0x0f;
+            case2 = (ce2 >> 4) &0x0f;
+            ret = case1 - case2;
+
+            if (ret) {
+                if (is_kana(*str1) && is_kana(*str2)) {
+                    if (flags & NORM_IGNOREKANATYPE) {
+                        if (flags & NORM_IGNOREWIDTH) {
+                            ret = 0;
+                        } else if (!is_half_katakana(*str1) && !is_half_katakana(*str2)) {
+                            ret = 0;
+                        }
+                    } else if (flags & NORM_IGNOREWIDTH) {
+                        if (!is_hiragana(*str1) && !is_hiragana(*str2)) {
+                            ret = 0;
+                        }
+                    }
+                } else if (is_latin(*str1) && is_latin(*str2)) {
+                    if ((flags & NORM_IGNOREWIDTH) && (flags & NORM_IGNORECASE)) {
+                        ret = 0;
+                    } else if (flags & NORM_IGNOREWIDTH) {
+                        /* Warning: Manual test seems to reveal width differences let a case field
+                         * difference of 1, but I am unsure of this. */
+                        if (ret == 1 || ret == -1) {
+                            ret = 0;
+                        }
+                    } else if (flags & NORM_IGNORECASE) {
+                        ret = 0;
+                    }
+                } else if (flags & NORM_IGNORECASE) {
+                    ret = 0;
+                }
+            }
+        } else {
+            int weight1, weight2;
+            weight1 = get_windows_special_character_weight(lcid, *str1);
+            weight2 = get_windows_special_character_weight(lcid, *str2);
+            ret = weight1 - weight2;
+        }
 
         if (ret) return ret;
 
@@ -292,13 +555,38 @@ static inline int compare_diacritic_weights(int flags, const WCHAR *str1, int le
         str2++;
         len2--;
     }
+    /* if NORM_IGNORESYMBOLS is set, "dream," must match "dream" as equal. */
+    if ((flags & NORM_IGNORESYMBOLS) && only_symbols_remains(str1, len1, str2, len2))
+        return 0;
     return len1 - len2;
 }
 
-static inline int compare_case_weights(int flags, const WCHAR *str1, int len1,
-                                       const WCHAR *str2, int len2)
+/* Fold the character at **str according to NORM_IGNOREWIDTH,
+ * NORM_IGNOREKANATYPE and NORM_IGNORECASE so the result can be compared by
+ * code point. May advance *str / decrement *len past a halfwidth (han)dakuten. */
+static inline WCHAR convert_to_diacritic_comparable(int flags, const WCHAR **str, int *len) {
+    WCHAR ch;
+    ch = **str;
+    if (flags & NORM_IGNOREWIDTH) {
+        if (is_half_katakana(ch))
+            ch = katakana_hankaku_to_hanzen(str, len);
+        if (is_wide_latin(ch))
+            ch = wide_latin_to_ascii(ch);
+    }
+    if (flags & NORM_IGNOREKANATYPE) {
+        if (is_wide_katakana(ch))
+            ch -= 0x60; /* convert to hiragana */
+    }
+    if (flags & NORM_IGNORECASE) {
+        if (isalphaW(ch))
+            ch = tolowerW(ch);
+    }
+    return ch;
+}
+
+static inline int compare_diacritic_weights(LCID lcid, int flags, const WCHAR *str1, int len1,
+                                            const WCHAR *str2, int len2)
 {
-    unsigned int ce1, ce2;
     int ret;
 
     /* 32-bit collation element table format:
@@ -326,13 +614,21 @@ static inline int compare_case_weights(int flags, const WCHAR *str1, int len1,
             if (skip) continue;
         }
 
-        ce1 = collation_table[collation_table[*str1 >> 8] + (*str1 & 0xff)];
-        ce2 = collation_table[collation_table[*str2 >> 8] + (*str2 & 0xff)];
-
-        if (ce1 != (unsigned int)-1 && ce2 != (unsigned int)-1)
-            ret = ((ce1 >> 4) & 0x0f) - ((ce2 >> 4) & 0x0f);
-        else
-            ret = *str1 - *str2;
+        if (!(is_windows_special_character(*str1) &&
+              is_windows_special_character(*str2)))
+        {
+            WCHAR ch1, ch2;
+            ch1 = convert_to_diacritic_comparable(flags, &str1, &len1);
+            ch2 = convert_to_diacritic_comparable(flags, &str2, &len2);
+
+            /* To put it simply, Wine collation table does not work with diacritics */
+            ret = ch1 - ch2;
+        } else {
+            int weight1, weight2;
+            weight1 = get_windows_special_character_weight(lcid, *str1);
+            weight2 = get_windows_special_character_weight(lcid, *str2);
+            ret = weight1 - weight2;
+        }
 
         if (ret) return ret;
 
@@ -351,21 +647,26 @@ static inline int compare_case_weights(int flags, const WCHAR *str1, int len1,
         str2++;
         len2--;
     }
+    /* if NORM_IGNORESYMBOLS is set, "dream," must match "dream" as equal. */
+    if ((flags & NORM_IGNORESYMBOLS) && only_symbols_remains(str1, len1, str2, len2))
+        return 0;
     return len1 - len2;
 }
 
-int wine_compare_string(int flags, const WCHAR *str1, int len1,
+int wine_compare_string(LCID lcid, int flags, const WCHAR *str1, int len1,
                         const WCHAR *str2, int len2)
 {
     int ret;
 
-    ret = compare_unicode_weights(flags, str1, len1, str2, len2);
+    ret = compare_unicode_weights(lcid, flags, str1, len1, str2, len2);
     if (!ret)
     {
-        if (!(flags & NORM_IGNORENONSPACE))
-            ret = compare_diacritic_weights(flags, str1, len1, str2, len2);
-        if (!ret && !(flags & NORM_IGNORECASE))
-            ret = compare_case_weights(flags, str1, len1, str2, len2);
+        if ((flags & (NORM_IGNORECASE | NORM_IGNOREKANATYPE | NORM_IGNOREWIDTH)) !=
+            (NORM_IGNORECASE | NORM_IGNOREKANATYPE | NORM_IGNOREWIDTH))
+            ret = compare_case_weights(lcid, flags, str1, len1, str2, len2);
+
+        if (!ret && !(flags & NORM_IGNORENONSPACE))
+            ret = compare_diacritic_weights(lcid, flags, str1, len1, str2, len2);
     }
     return ret;
 }