Skip to content
This repository was archived by the owner on Jan 23, 2023. It is now read-only.

Commit a99bf5f

Browse files
committed
Adding support for String CompareOptions IgnoreKanaType and IgnoreWidth on Unix.
1 parent 542a151 commit a99bf5f

File tree

1 file changed

+160
-3
lines changed

1 file changed

+160
-3
lines changed

src/corefx/System.Globalization.Native/collation.cpp

Lines changed: 160 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66

77
#include <assert.h>
88
#include <stdint.h>
9+
#include <vector>
910
#include <map>
1011
#include <unicode/uchar.h>
1112
#include <unicode/ucol.h>
@@ -15,8 +16,8 @@
1516
const int32_t CompareOptionsIgnoreCase = 1;
1617
const int32_t CompareOptionsIgnoreNonSpace = 2;
1718
const int32_t CompareOptionsIgnoreSymbols = 4;
18-
// const int32_t CompareOptionsIgnoreKanaType = 8;
19-
// const int32_t CompareOptionsIgnoreWidth = 0x10;
19+
const int32_t CompareOptionsIgnoreKanaType = 8;
20+
const int32_t CompareOptionsIgnoreWidth = 0x10;
2021
// const int32_t CompareOptionsStringSort = 0x20000000;
2122

2223
typedef std::map<int32_t, UCollator*> TCollatorMap;
@@ -38,6 +39,138 @@ typedef struct _sort_handle
3839

3940
} SortHandle;
4041

42+
// Hiragana character range
43+
const UChar hiraganaStart = 0x3041;
44+
const UChar hiraganaEnd = 0x309e;
45+
const UChar hiraganaToKatakanaOffset = 0x30a1 - 0x3041;
46+
47+
// Mapping between half- and fullwidth characters.
48+
// LowerChars are the characters that should sort lower than HigherChars
49+
const UChar g_HalfFullLowerChars[] = {
50+
// halfwidth characters
51+
0x0021, 0x0022, 0x0023, 0x0024, 0x0025, 0x0026, 0x0027, 0x0028, 0x0029, 0x002a, 0x002b, 0x002c, 0x002d, 0x002e, 0x002f,
52+
0x0030, 0x0031, 0x0032, 0x0033, 0x0034, 0x0035, 0x0036, 0x0037, 0x0038, 0x0039, 0x003a, 0x003b, 0x003c, 0x003d, 0x003e,
53+
0x003f, 0x0040, 0x0041, 0x0042, 0x0043, 0x0044, 0x0045, 0x0046, 0x0047, 0x0048, 0x0049, 0x004a, 0x004b, 0x004c, 0x004d,
54+
0x004e, 0x004f, 0x0050, 0x0051, 0x0052, 0x0053, 0x0054, 0x0055, 0x0056, 0x0057, 0x0058, 0x0059, 0x005a, 0x005b, 0x005d,
55+
0x005e, 0x005f, 0x0060, 0x0061, 0x0062, 0x0063, 0x0064, 0x0065, 0x0066, 0x0067, 0x0068, 0x0069, 0x006a, 0x006b, 0x006c,
56+
0x006d, 0x006e, 0x006f, 0x0070, 0x0071, 0x0072, 0x0073, 0x0074, 0x0075, 0x0076, 0x0077, 0x0078, 0x0079, 0x007a, 0x007b,
57+
0x007c, 0x007d, 0x007e, 0x00a2, 0x00a3, 0x00ac, 0x00af, 0x00a6, 0x00a5, 0x20a9,
58+
59+
// fullwidth characters
60+
0x3002, 0x300c, 0x300d, 0x3001, 0x30fb, 0x30f2, 0x30a1, 0x30a3, 0x30a5, 0x30a7, 0x30a9, 0x30e3, 0x30e5, 0x30e7, 0x30c3,
61+
0x30fc, 0x30a2, 0x30a4, 0x30a6, 0x30a8, 0x30aa, 0x30ab, 0x30ad, 0x30af, 0x30b1, 0x30b3, 0x30b5, 0x30b7, 0x30b9, 0x30bb,
62+
0x30bd, 0x30bf, 0x30c1, 0x30c4, 0x30c6, 0x30c8, 0x30ca, 0x30cb, 0x30cc, 0x30cd, 0x30ce, 0x30cf, 0x30d2, 0x30d5, 0x30d8,
63+
0x30db, 0x30de, 0x30df, 0x30e0, 0x30e1, 0x30e2, 0x30e4, 0x30e6, 0x30e8, 0x30e9, 0x30ea, 0x30eb, 0x30ec, 0x30ed, 0x30ef,
64+
0x30f3, 0x3099, 0x309a, 0x3164, 0x3131, 0x3132, 0x3133, 0x3134, 0x3135, 0x3136, 0x3137, 0x3138, 0x3139, 0x313a, 0x313b,
65+
0x313c, 0x313d, 0x313e, 0x313f, 0x3140, 0x3141, 0x3142, 0x3143, 0x3144, 0x3145, 0x3146, 0x3147, 0x3148, 0x3149, 0x314a,
66+
0x314b, 0x314c, 0x314d, 0x314e, 0x314f, 0x3150, 0x3151, 0x3152, 0x3153, 0x3154, 0x3155, 0x3156, 0x3157, 0x3158, 0x3159,
67+
0x315a, 0x315b, 0x315c, 0x315d, 0x315e, 0x315f, 0x3160, 0x3161, 0x3162, 0x3163
68+
};
69+
const UChar g_HalfFullHigherChars[] = {
70+
// fullwidth characters
71+
0xff01, 0xff02, 0xff03, 0xff04, 0xff05, 0xff06, 0xff07, 0xff08, 0xff09, 0xff0a, 0xff0b, 0xff0c, 0xff0d, 0xff0e, 0xff0f,
72+
0xff10, 0xff11, 0xff12, 0xff13, 0xff14, 0xff15, 0xff16, 0xff17, 0xff18, 0xff19, 0xff1a, 0xff1b, 0xff1c, 0xff1d, 0xff1e,
73+
0xff1f, 0xff20, 0xff21, 0xff22, 0xff23, 0xff24, 0xff25, 0xff26, 0xff27, 0xff28, 0xff29, 0xff2a, 0xff2b, 0xff2c, 0xff2d,
74+
0xff2e, 0xff2f, 0xff30, 0xff31, 0xff32, 0xff33, 0xff34, 0xff35, 0xff36, 0xff37, 0xff38, 0xff39, 0xff3a, 0xff3b, 0xff3d,
75+
0xff3e, 0xff3f, 0xff40, 0xff41, 0xff42, 0xff43, 0xff44, 0xff45, 0xff46, 0xff47, 0xff48, 0xff49, 0xff4a, 0xff4b, 0xff4c,
76+
0xff4d, 0xff4e, 0xff4f, 0xff50, 0xff51, 0xff52, 0xff53, 0xff54, 0xff55, 0xff56, 0xff57, 0xff58, 0xff59, 0xff5a, 0xff5b,
77+
0xff5c, 0xff5d, 0xff5e, 0xffe0, 0xffe1, 0xffe2, 0xffe3, 0xffe4, 0xffe5, 0xffe6,
78+
79+
// halfwidth characters
80+
0xff61, 0xff62, 0xff63, 0xff64, 0xff65, 0xff66, 0xff67, 0xff68, 0xff69, 0xff6a, 0xff6b, 0xff6c, 0xff6d, 0xff6e, 0xff6f,
81+
0xff70, 0xff71, 0xff72, 0xff73, 0xff74, 0xff75, 0xff76, 0xff77, 0xff78, 0xff79, 0xff7a, 0xff7b, 0xff7c, 0xff7d, 0xff7e,
82+
0xff7f, 0xff80, 0xff81, 0xff82, 0xff83, 0xff84, 0xff85, 0xff86, 0xff87, 0xff88, 0xff89, 0xff8a, 0xff8b, 0xff8c, 0xff8d,
83+
0xff8e, 0xff8f, 0xff90, 0xff91, 0xff92, 0xff93, 0xff94, 0xff95, 0xff96, 0xff97, 0xff98, 0xff99, 0xff9a, 0xff9b, 0xff9c,
84+
0xff9d, 0xff9e, 0xff9f, 0xffa0, 0xffa1, 0xffa2, 0xffa3, 0xffa4, 0xffa5, 0xffa6, 0xffa7, 0xffa8, 0xffa9, 0xffaa, 0xffab,
85+
0xffac, 0xffad, 0xffae, 0xffaf, 0xffb0, 0xffb1, 0xffb2, 0xffb3, 0xffb4, 0xffb5, 0xffb6, 0xffb7, 0xffb8, 0xffb9, 0xffba,
86+
0xffbb, 0xffbc, 0xffbd, 0xffbe, 0xffc2, 0xffc3, 0xffc4, 0xffc5, 0xffc6, 0xffc7, 0xffca, 0xffcb, 0xffcc, 0xffcd, 0xffce,
87+
0xffcf, 0xffd2, 0xffd3, 0xffd4, 0xffd5, 0xffd6, 0xffd7, 0xffda, 0xffdb, 0xffdc
88+
};
89+
const int32_t g_HalfFullCharsLength = (sizeof(g_HalfFullHigherChars) / sizeof(UChar));
90+
91+
/*
92+
ICU collation rules reserve any punctuation and whitespace characters for use in the syntax.
93+
Thus, to use these characters in a rule, they need to be escaped.
94+
*/
95+
bool NeedsEscape(UChar character)
96+
{
97+
return ((0x21 <= character && character <= 0x2f)
98+
|| (0x3a <= character && character <= 0x40)
99+
|| (0x5b <= character && character <= 0x60)
100+
|| (0x7b <= character && character <= 0x7e));
101+
}
102+
103+
/*
104+
Gets a string of custom collation rules, if necessary.
105+
106+
Since the CompareOptions flags don't map 1:1 with ICU default functionality, we need to fall back to using
107+
custom rules in order to support IgnoreKanaType and IgnoreWidth CompareOptions correctly.
108+
*/
109+
std::vector<UChar> GetCustomRules(int32_t options, UColAttributeValue strength)
110+
{
111+
bool isIgnoreKanaType = (options & CompareOptionsIgnoreKanaType) == CompareOptionsIgnoreKanaType;
112+
bool isIgnoreWidth = (options & CompareOptionsIgnoreWidth) == CompareOptionsIgnoreWidth;
113+
114+
// kana differs at the quaternary level
115+
bool needsIgnoreKanaTypeCustomRule = isIgnoreKanaType && (strength == UCOL_DEFAULT || strength >= UCOL_QUATERNARY);
116+
bool needsNotIgnoreKanaTypeCustomRule = !isIgnoreKanaType && (strength != UCOL_DEFAULT && strength < UCOL_QUATERNARY);
117+
118+
// character width differs at the tertiary/default level
119+
bool needsIgnoreWidthCustomRule = isIgnoreWidth && (strength == UCOL_DEFAULT || strength >= UCOL_TERTIARY);
120+
bool needsNotIgnoreWidthCustomRule = !isIgnoreWidth && (strength != UCOL_DEFAULT && strength < UCOL_TERTIARY);
121+
122+
std::vector<UChar> customRules;
123+
if (needsIgnoreKanaTypeCustomRule || needsNotIgnoreKanaTypeCustomRule || needsIgnoreWidthCustomRule || needsNotIgnoreWidthCustomRule)
124+
{
125+
// If we need to create customRules, the KanaType custom rule will be 88 kana characters * 4 = 352 chars long
126+
// and the Width custom rule will be at least 215 halfwidth characters * 4 = 860 chars long.
127+
// Use 512 as the starting size, so the customRules won't have to grow if we are just
128+
// doing the KanaType custom rule.
129+
customRules.reserve(512);
130+
131+
if (needsIgnoreKanaTypeCustomRule || needsNotIgnoreKanaTypeCustomRule)
132+
{
133+
UChar compareChar = needsIgnoreKanaTypeCustomRule ? '=' : '<';
134+
135+
for (UChar hiraganaChar = hiraganaStart; hiraganaChar <= hiraganaEnd; hiraganaChar++)
136+
{
137+
// Hiragana is the range 3041 to 3096 & 309D & 309E
138+
if (hiraganaChar <= 0x3096 || hiraganaChar >= 0x309D) // characters between 3096 and 309D are not mapped to katakana
139+
{
140+
customRules.push_back('&');
141+
customRules.push_back(hiraganaChar);
142+
customRules.push_back(compareChar);
143+
customRules.push_back(hiraganaChar + hiraganaToKatakanaOffset);
144+
}
145+
}
146+
}
147+
148+
if (needsIgnoreWidthCustomRule || needsNotIgnoreWidthCustomRule)
149+
{
150+
UChar compareChar = needsIgnoreWidthCustomRule ? '=' : '<';
151+
152+
UChar lowerChar;
153+
for (int i = 0; i < g_HalfFullCharsLength; i++)
154+
{
155+
customRules.push_back('&');
156+
157+
// the lower chars need to be checked for escaping since they contain ASCII punctuation
158+
lowerChar = g_HalfFullLowerChars[i];
159+
if (NeedsEscape(lowerChar))
160+
{
161+
customRules.push_back('\\');
162+
}
163+
customRules.push_back(lowerChar);
164+
165+
customRules.push_back(compareChar);
166+
customRules.push_back(g_HalfFullHigherChars[i]);
167+
}
168+
}
169+
}
170+
171+
return customRules;
172+
}
173+
41174
/*
42175
* The collator returned by this function is owned by the caller and must be
43176
* closed when this method returns with a U_SUCCESS UErrorCode.
@@ -62,7 +195,31 @@ UCollator* CloneCollatorWithOptions(const UCollator* pCollator, int32_t options,
62195
strength = UCOL_PRIMARY;
63196
}
64197

65-
UCollator* pClonedCollator = ucol_safeClone(pCollator, nullptr, nullptr, pErr);
198+
UCollator* pClonedCollator;
199+
std::vector<UChar> customRules = GetCustomRules(options, strength);
200+
if (customRules.empty())
201+
{
202+
pClonedCollator = ucol_safeClone(pCollator, nullptr, nullptr, pErr);
203+
}
204+
else
205+
{
206+
int32_t customRuleLength = customRules.size();
207+
208+
int32_t localeRulesLength;
209+
const UChar* localeRules = ucol_getRules(pCollator, &localeRulesLength);
210+
211+
std::vector<UChar> completeRules(localeRulesLength + customRuleLength + 1, '\0');
212+
for (int i = 0; i < localeRulesLength; i++)
213+
{
214+
completeRules[i] = localeRules[i];
215+
}
216+
for (int i = 0; i < customRuleLength; i++)
217+
{
218+
completeRules[localeRulesLength + i] = customRules[i];
219+
}
220+
221+
pClonedCollator = ucol_openRules(completeRules.data(), completeRules.size(), UCOL_DEFAULT, strength, NULL, pErr);
222+
}
66223

67224
if (isIgnoreSymbols)
68225
{

0 commit comments

Comments
 (0)