66
77#include < assert.h>
88#include < stdint.h>
9+ #include < vector>
910#include < map>
1011#include < unicode/uchar.h>
1112#include < unicode/ucol.h>
// Bit flags carried in the `options` argument of the collation helpers in this file.
// Every flag is a distinct power of two, so they can be OR-ed together and tested
// individually with `options & flag`.
const int32_t CompareOptionsIgnoreCase     = 0x01;
const int32_t CompareOptionsIgnoreNonSpace = 0x02;
const int32_t CompareOptionsIgnoreSymbols  = 0x04;
const int32_t CompareOptionsIgnoreKanaType = 0x08; // examined by GetCustomRules
const int32_t CompareOptionsIgnoreWidth    = 0x10; // examined by GetCustomRules
// Not defined here (left disabled):
// const int32_t CompareOptionsStringSort = 0x20000000;
2122
2223typedef std::map<int32_t , UCollator*> TCollatorMap;
@@ -38,6 +39,138 @@ typedef struct _sort_handle
3839
3940} SortHandle;
4041
42+ // Hiragana character range
43+ const UChar hiraganaStart = 0x3041 ;
44+ const UChar hiraganaEnd = 0x309e ;
45+ const UChar hiraganaToKatakanaOffset = 0x30a1 - 0x3041 ;
46+
47+ // Mapping between half- and fullwidth characters: entry i of g_HalfFullLowerChars is paired with
// entry i of g_HalfFullHigherChars, so the two tables must stay the same length and in the same order.
48+ // LowerChars are the characters that should sort lower than HigherChars. GetCustomRules emits one rule per
// pair: "&lower<higher" when width must be distinguished, "&lower=higher" when width must be ignored.
49+ const UChar g_HalfFullLowerChars[] = {
50+ // halfwidth characters (100 entries): 0x0021-0x007e with 0x005c left out, then seven code points outside that range
51+ 0x0021 , 0x0022 , 0x0023 , 0x0024 , 0x0025 , 0x0026 , 0x0027 , 0x0028 , 0x0029 , 0x002a , 0x002b , 0x002c , 0x002d , 0x002e , 0x002f ,
52+ 0x0030 , 0x0031 , 0x0032 , 0x0033 , 0x0034 , 0x0035 , 0x0036 , 0x0037 , 0x0038 , 0x0039 , 0x003a , 0x003b , 0x003c , 0x003d , 0x003e ,
53+ 0x003f , 0x0040 , 0x0041 , 0x0042 , 0x0043 , 0x0044 , 0x0045 , 0x0046 , 0x0047 , 0x0048 , 0x0049 , 0x004a , 0x004b , 0x004c , 0x004d ,
54+ 0x004e , 0x004f , 0x0050 , 0x0051 , 0x0052 , 0x0053 , 0x0054 , 0x0055 , 0x0056 , 0x0057 , 0x0058 , 0x0059 , 0x005a , 0x005b , 0x005d ,
55+ 0x005e , 0x005f , 0x0060 , 0x0061 , 0x0062 , 0x0063 , 0x0064 , 0x0065 , 0x0066 , 0x0067 , 0x0068 , 0x0069 , 0x006a , 0x006b , 0x006c ,
56+ 0x006d , 0x006e , 0x006f , 0x0070 , 0x0071 , 0x0072 , 0x0073 , 0x0074 , 0x0075 , 0x0076 , 0x0077 , 0x0078 , 0x0079 , 0x007a , 0x007b ,
57+ 0x007c , 0x007d , 0x007e , 0x00a2 , 0x00a3 , 0x00ac , 0x00af , 0x00a6 , 0x00a5 , 0x20a9 ,
58+
59+ // fullwidth characters (115 entries); their paired halfwidth forms are the second group of g_HalfFullHigherChars
60+ 0x3002 , 0x300c , 0x300d , 0x3001 , 0x30fb , 0x30f2 , 0x30a1 , 0x30a3 , 0x30a5 , 0x30a7 , 0x30a9 , 0x30e3 , 0x30e5 , 0x30e7 , 0x30c3 ,
61+ 0x30fc , 0x30a2 , 0x30a4 , 0x30a6 , 0x30a8 , 0x30aa , 0x30ab , 0x30ad , 0x30af , 0x30b1 , 0x30b3 , 0x30b5 , 0x30b7 , 0x30b9 , 0x30bb ,
62+ 0x30bd , 0x30bf , 0x30c1 , 0x30c4 , 0x30c6 , 0x30c8 , 0x30ca , 0x30cb , 0x30cc , 0x30cd , 0x30ce , 0x30cf , 0x30d2 , 0x30d5 , 0x30d8 ,
63+ 0x30db , 0x30de , 0x30df , 0x30e0 , 0x30e1 , 0x30e2 , 0x30e4 , 0x30e6 , 0x30e8 , 0x30e9 , 0x30ea , 0x30eb , 0x30ec , 0x30ed , 0x30ef ,
64+ 0x30f3 , 0x3099 , 0x309a , 0x3164 , 0x3131 , 0x3132 , 0x3133 , 0x3134 , 0x3135 , 0x3136 , 0x3137 , 0x3138 , 0x3139 , 0x313a , 0x313b ,
65+ 0x313c , 0x313d , 0x313e , 0x313f , 0x3140 , 0x3141 , 0x3142 , 0x3143 , 0x3144 , 0x3145 , 0x3146 , 0x3147 , 0x3148 , 0x3149 , 0x314a ,
66+ 0x314b , 0x314c , 0x314d , 0x314e , 0x314f , 0x3150 , 0x3151 , 0x3152 , 0x3153 , 0x3154 , 0x3155 , 0x3156 , 0x3157 , 0x3158 , 0x3159 ,
67+ 0x315a , 0x315b , 0x315c , 0x315d , 0x315e , 0x315f , 0x3160 , 0x3161 , 0x3162 , 0x3163
68+ };
// The "higher" half of each half/fullwidth pair: entry i here is paired with entry i of
// g_HalfFullLowerChars and is placed after (or made equal to) it by the rules GetCustomRules builds.
// g_HalfFullCharsLength is computed from this table.
69+ const UChar g_HalfFullHigherChars[] = {
70+ // fullwidth characters (100 entries): 0xff01-0xff5e with 0xff3c left out, then 0xffe0-0xffe6; paired with the first group of g_HalfFullLowerChars
71+ 0xff01 , 0xff02 , 0xff03 , 0xff04 , 0xff05 , 0xff06 , 0xff07 , 0xff08 , 0xff09 , 0xff0a , 0xff0b , 0xff0c , 0xff0d , 0xff0e , 0xff0f ,
72+ 0xff10 , 0xff11 , 0xff12 , 0xff13 , 0xff14 , 0xff15 , 0xff16 , 0xff17 , 0xff18 , 0xff19 , 0xff1a , 0xff1b , 0xff1c , 0xff1d , 0xff1e ,
73+ 0xff1f , 0xff20 , 0xff21 , 0xff22 , 0xff23 , 0xff24 , 0xff25 , 0xff26 , 0xff27 , 0xff28 , 0xff29 , 0xff2a , 0xff2b , 0xff2c , 0xff2d ,
74+ 0xff2e , 0xff2f , 0xff30 , 0xff31 , 0xff32 , 0xff33 , 0xff34 , 0xff35 , 0xff36 , 0xff37 , 0xff38 , 0xff39 , 0xff3a , 0xff3b , 0xff3d ,
75+ 0xff3e , 0xff3f , 0xff40 , 0xff41 , 0xff42 , 0xff43 , 0xff44 , 0xff45 , 0xff46 , 0xff47 , 0xff48 , 0xff49 , 0xff4a , 0xff4b , 0xff4c ,
76+ 0xff4d , 0xff4e , 0xff4f , 0xff50 , 0xff51 , 0xff52 , 0xff53 , 0xff54 , 0xff55 , 0xff56 , 0xff57 , 0xff58 , 0xff59 , 0xff5a , 0xff5b ,
77+ 0xff5c , 0xff5d , 0xff5e , 0xffe0 , 0xffe1 , 0xffe2 , 0xffe3 , 0xffe4 , 0xffe5 , 0xffe6 ,
78+
79+ // halfwidth characters (115 entries, taken from 0xff61-0xffdc with gaps); paired with the second group of g_HalfFullLowerChars
80+ 0xff61 , 0xff62 , 0xff63 , 0xff64 , 0xff65 , 0xff66 , 0xff67 , 0xff68 , 0xff69 , 0xff6a , 0xff6b , 0xff6c , 0xff6d , 0xff6e , 0xff6f ,
81+ 0xff70 , 0xff71 , 0xff72 , 0xff73 , 0xff74 , 0xff75 , 0xff76 , 0xff77 , 0xff78 , 0xff79 , 0xff7a , 0xff7b , 0xff7c , 0xff7d , 0xff7e ,
82+ 0xff7f , 0xff80 , 0xff81 , 0xff82 , 0xff83 , 0xff84 , 0xff85 , 0xff86 , 0xff87 , 0xff88 , 0xff89 , 0xff8a , 0xff8b , 0xff8c , 0xff8d ,
83+ 0xff8e , 0xff8f , 0xff90 , 0xff91 , 0xff92 , 0xff93 , 0xff94 , 0xff95 , 0xff96 , 0xff97 , 0xff98 , 0xff99 , 0xff9a , 0xff9b , 0xff9c ,
84+ 0xff9d , 0xff9e , 0xff9f , 0xffa0 , 0xffa1 , 0xffa2 , 0xffa3 , 0xffa4 , 0xffa5 , 0xffa6 , 0xffa7 , 0xffa8 , 0xffa9 , 0xffaa , 0xffab ,
85+ 0xffac , 0xffad , 0xffae , 0xffaf , 0xffb0 , 0xffb1 , 0xffb2 , 0xffb3 , 0xffb4 , 0xffb5 , 0xffb6 , 0xffb7 , 0xffb8 , 0xffb9 , 0xffba ,
86+ 0xffbb , 0xffbc , 0xffbd , 0xffbe , 0xffc2 , 0xffc3 , 0xffc4 , 0xffc5 , 0xffc6 , 0xffc7 , 0xffca , 0xffcb , 0xffcc , 0xffcd , 0xffce ,
87+ 0xffcf , 0xffd2 , 0xffd3 , 0xffd4 , 0xffd5 , 0xffd6 , 0xffd7 , 0xffda , 0xffdb , 0xffdc
88+ };
89+ const int32_t g_HalfFullCharsLength = (sizeof (g_HalfFullHigherChars) / sizeof (UChar));
90+
91+ /*
92+ ICU collation rules reserve any punctuation and whitespace characters for use in the syntax.
93+ Thus, to use these characters in a rule, they need to be escaped.
94+ */
95+ bool NeedsEscape (UChar character)
96+ {
97+ return ((0x21 <= character && character <= 0x2f )
98+ || (0x3a <= character && character <= 0x40 )
99+ || (0x5b <= character && character <= 0x60 )
100+ || (0x7b <= character && character <= 0x7e ));
101+ }
102+
103+ /*
104+ Gets a string of custom collation rules, if necessary.
105+
106+ Since the CompareOptions flags don't map 1:1 with ICU default functionality, we need to fall back to using
107+ custom rules in order to support IgnoreKanaType and IgnoreWidth CompareOptions correctly.
108+ */
109+ std::vector<UChar> GetCustomRules (int32_t options, UColAttributeValue strength)
110+ {
111+ bool isIgnoreKanaType = (options & CompareOptionsIgnoreKanaType) == CompareOptionsIgnoreKanaType;
112+ bool isIgnoreWidth = (options & CompareOptionsIgnoreWidth) == CompareOptionsIgnoreWidth;
113+
114+ // kana differs at the quaternary level
115+ bool needsIgnoreKanaTypeCustomRule = isIgnoreKanaType && (strength == UCOL_DEFAULT || strength >= UCOL_QUATERNARY);
116+ bool needsNotIgnoreKanaTypeCustomRule = !isIgnoreKanaType && (strength != UCOL_DEFAULT && strength < UCOL_QUATERNARY);
117+
118+ // character width differs at the tertiary/default level
119+ bool needsIgnoreWidthCustomRule = isIgnoreWidth && (strength == UCOL_DEFAULT || strength >= UCOL_TERTIARY);
120+ bool needsNotIgnoreWidthCustomRule = !isIgnoreWidth && (strength != UCOL_DEFAULT && strength < UCOL_TERTIARY);
121+
122+ std::vector<UChar> customRules;
123+ if (needsIgnoreKanaTypeCustomRule || needsNotIgnoreKanaTypeCustomRule || needsIgnoreWidthCustomRule || needsNotIgnoreWidthCustomRule)
124+ {
125+ // If we need to create customRules, the KanaType custom rule will be 88 kana characters * 4 = 352 chars long
126+ // and the Width custom rule will be at least 215 halfwidth characters * 4 = 860 chars long.
127+ // Use 512 as the starting size, so the customRules won't have to grow if we are just
128+ // doing the KanaType custom rule.
129+ customRules.reserve (512 );
130+
131+ if (needsIgnoreKanaTypeCustomRule || needsNotIgnoreKanaTypeCustomRule)
132+ {
133+ UChar compareChar = needsIgnoreKanaTypeCustomRule ? ' =' : ' <' ;
134+
135+ for (UChar hiraganaChar = hiraganaStart; hiraganaChar <= hiraganaEnd; hiraganaChar++)
136+ {
137+ // Hiragana is the range 3041 to 3096 & 309D & 309E
138+ if (hiraganaChar <= 0x3096 || hiraganaChar >= 0x309D ) // characters between 3096 and 309D are not mapped to katakana
139+ {
140+ customRules.push_back (' &' );
141+ customRules.push_back (hiraganaChar);
142+ customRules.push_back (compareChar);
143+ customRules.push_back (hiraganaChar + hiraganaToKatakanaOffset);
144+ }
145+ }
146+ }
147+
148+ if (needsIgnoreWidthCustomRule || needsNotIgnoreWidthCustomRule)
149+ {
150+ UChar compareChar = needsIgnoreWidthCustomRule ? ' =' : ' <' ;
151+
152+ UChar lowerChar;
153+ for (int i = 0 ; i < g_HalfFullCharsLength; i++)
154+ {
155+ customRules.push_back (' &' );
156+
157+ // the lower chars need to be checked for escaping since they contain ASCII punctuation
158+ lowerChar = g_HalfFullLowerChars[i];
159+ if (NeedsEscape (lowerChar))
160+ {
161+ customRules.push_back (' \\ ' );
162+ }
163+ customRules.push_back (lowerChar);
164+
165+ customRules.push_back (compareChar);
166+ customRules.push_back (g_HalfFullHigherChars[i]);
167+ }
168+ }
169+ }
170+
171+ return customRules;
172+ }
173+
41174/*
42175 * The collator returned by this function is owned by the callee and must be
43176 * closed when this method returns with a U_SUCCESS UErrorCode.
@@ -62,7 +195,31 @@ UCollator* CloneCollatorWithOptions(const UCollator* pCollator, int32_t options,
62195 strength = UCOL_PRIMARY;
63196 }
64197
65- UCollator* pClonedCollator = ucol_safeClone (pCollator, nullptr , nullptr , pErr);
198+ UCollator* pClonedCollator;
199+ std::vector<UChar> customRules = GetCustomRules (options, strength);
200+ if (customRules.empty ())
201+ {
202+ pClonedCollator = ucol_safeClone (pCollator, nullptr , nullptr , pErr);
203+ }
204+ else
205+ {
206+ int32_t customRuleLength = customRules.size ();
207+
208+ int32_t localeRulesLength;
209+ const UChar* localeRules = ucol_getRules (pCollator, &localeRulesLength);
210+
211+ std::vector<UChar> completeRules (localeRulesLength + customRuleLength + 1 , ' \0 ' );
212+ for (int i = 0 ; i < localeRulesLength; i++)
213+ {
214+ completeRules[i] = localeRules[i];
215+ }
216+ for (int i = 0 ; i < customRuleLength; i++)
217+ {
218+ completeRules[localeRulesLength + i] = customRules[i];
219+ }
220+
221+ pClonedCollator = ucol_openRules (completeRules.data (), completeRules.size (), UCOL_DEFAULT, strength, NULL , pErr);
222+ }
66223
67224 if (isIgnoreSymbols)
68225 {
0 commit comments