Skip to content

Commit a47de54

Browse files
committed
add the grapheme cluster break automaton
1 parent 7f56b78 commit a47de54

File tree

3 files changed

+464
-1
lines changed

3 files changed

+464
-1
lines changed

Modules/clinic/unicodedata.c.h

Lines changed: 30 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -406,4 +406,33 @@ unicodedata_UCD_lookup(PyObject *self, PyObject *arg)
406406
exit:
407407
return return_value;
408408
}
409-
/*[clinic end generated code: output=a14dae8898d6b123 input=a9049054013a1b77]*/
409+
410+
/* Docstring for the break_graphemes method; it is referenced by the method
 * entry that UNICODEDATA_UCD_BREAK_GRAPHEMES_METHODDEF expands to.
 * NOTE(review): this file ends with a "clinic end generated code" checksum
 * marker, so this text is presumably regenerated by Argument Clinic rather
 * than edited by hand -- confirm before changing it here. */
PyDoc_STRVAR(unicodedata_UCD_break_graphemes__doc__,
411+
"break_graphemes($self, unistr, /)\n"
412+
"--\n"
413+
"\n"
414+
"Returns an iterator to iterate over grapheme clusters in unistr.\n"
415+
"\n"
416+
"It uses extended grapheme cluster rules from TR29.");
417+
418+
/* Method-table entry: exposes the wrapper unicodedata_UCD_break_graphemes
 * under the Python name "break_graphemes" using METH_O (the callable
 * receives exactly one object argument) together with the docstring above. */
#define UNICODEDATA_UCD_BREAK_GRAPHEMES_METHODDEF \
419+
{"break_graphemes", (PyCFunction)unicodedata_UCD_break_graphemes, METH_O, unicodedata_UCD_break_graphemes__doc__},
420+
421+
/* Forward declaration of the implementation that the wrapper calls once the
 * argument has been parsed. Its definition is not in this header --
 * presumably it lives in unicodedata.c (TODO confirm). */
static PyObject *
422+
unicodedata_UCD_break_graphemes_impl(PyObject *self, PyObject *unistr);
423+
424+
/* METH_O entry point for break_graphemes.
 *
 * self: passed through unchanged to the implementation.
 * arg:  the single argument supplied by the caller.
 *
 * Parses `arg` with the "U" format unit and forwards the resulting object
 * to unicodedata_UCD_break_graphemes_impl().
 *
 * Returns the implementation's result, or NULL when PyArg_Parse() fails
 * (return_value is still NULL on that path). */
static PyObject *
425+
unicodedata_UCD_break_graphemes(PyObject *self, PyObject *arg)
426+
{
427+
PyObject *return_value = NULL;
428+
PyObject *unistr;
429+
430+
/* The text after ':' in the format string is the function name that
 * PyArg_Parse uses in its error messages. */
if (!PyArg_Parse(arg, "U:break_graphemes", &unistr)) {
431+
goto exit;
432+
}
433+
return_value = unicodedata_UCD_break_graphemes_impl(self, unistr);
434+
435+
/* Single exit point shared by the success and the parse-failure path. */
exit:
436+
return return_value;
437+
}
438+
/*[clinic end generated code: output=e7aa6367f1c3caf3 input=a9049054013a1b77]*/
Lines changed: 327 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,327 @@
1+
/*
 * Input classes consumed by the grapheme-cluster automaton.
 *
 * The names mirror the Grapheme_Cluster_Break property values used by the
 * TR29 rules. `Any` and `eot` are additional classes; `eot` is the input
 * that moves STATE_sot to STATE_eot in GRAPH_CLUSTER_AUTOMATON.
 * NOTE(review): `Any` is presumably the class for code points that belong
 * to none of the others -- the classifier is not in this file; confirm.
 *
 * Each value is used as a column index into GRAPH_CLUSTER_AUTOMATON, so the
 * numbering below must stay dense and in this order.
 */
typedef enum {
    /* Line terminators and controls. */
    CR = 0,
    LF = 1,
    Control = 2,

    /* Classes that attach to, or join, neighbouring code points. */
    Extend = 3,
    ZWJ = 4,
    Regional_Indicator = 5,
    Prepend = 6,
    SpacingMark = 7,

    /* Hangul syllable components. */
    L = 8,
    V = 9,
    T = 10,
    LV = 11,
    LVT = 12,

    /* Emoji sequence components. */
    E_Base = 13,
    E_Modifier = 14,
    Glue_After_Zwj = 15,
    E_Base_GAZ = 16,

    /* Remaining classes. */
    Any = 17,
    eot = 18
} GraphemeClusterBreakType;
22+
23+
/*
 * States of the grapheme-cluster automaton; each value is a row index into
 * GRAPH_CLUSTER_AUTOMATON, so the numbering must stay dense and in order.
 *
 * STATE_BREAK is 0, which means a table cell without an explicit
 * initializer reads as STATE_BREAK.
 * NOTE(review): STATE_BREAK presumably tells the driver that a cluster
 * boundary was reached -- the driver is not in this file; confirm.
 */
typedef enum {
    STATE_BREAK = 0,

    /* Text boundaries: STATE_sot is the only row that can reach STATE_eot. */
    STATE_sot = 1,
    STATE_eot = 2,

    /* Entered on the CR, LF and Control classes respectively. */
    STATE_CR = 3,
    STATE_LF = 4,
    STATE_Control = 5,

    /* Hangul: entered on L, on V or LV, and on T or LVT. */
    STATE_L = 6,
    STATE_V_or_LV = 7,
    STATE_T_or_LVT = 8,

    STATE_Prepend = 9,
    STATE_ZWJ = 10,

    /* Entered on E_Base or E_Base_GAZ; Extend keeps the automaton here. */
    STATE_Emoji = 11,

    /* First and second Regional_Indicator of a pair. */
    STATE_RI_1 = 12,
    STATE_RI_2 = 13,

    STATE_Any = 14
} GCBState;
40+
41+
static GCBState GRAPH_CLUSTER_AUTOMATON[15][20] = {
42+
[STATE_BREAK] = {STATE_BREAK,
43+
STATE_BREAK,
44+
STATE_BREAK,
45+
STATE_BREAK,
46+
STATE_BREAK,
47+
STATE_BREAK,
48+
STATE_BREAK,
49+
STATE_BREAK,
50+
STATE_BREAK,
51+
STATE_BREAK,
52+
STATE_BREAK,
53+
STATE_BREAK,
54+
STATE_BREAK,
55+
STATE_BREAK,
56+
STATE_BREAK,
57+
STATE_BREAK,
58+
STATE_BREAK,
59+
STATE_BREAK,
60+
STATE_BREAK},
61+
[STATE_sot] = {[CR] = STATE_CR,
62+
[LF] = STATE_LF,
63+
[Control] = STATE_Control,
64+
[Extend] = STATE_Any,
65+
[ZWJ] = STATE_ZWJ,
66+
[Regional_Indicator] = STATE_RI_1,
67+
[Prepend] = STATE_Prepend,
68+
[SpacingMark] = STATE_Any,
69+
[L] = STATE_L,
70+
[V] = STATE_V_or_LV,
71+
[T] = STATE_T_or_LVT,
72+
[LV] = STATE_V_or_LV,
73+
[LVT] = STATE_T_or_LVT,
74+
[E_Base] = STATE_Emoji,
75+
[E_Modifier] = STATE_Any,
76+
[Glue_After_Zwj] = STATE_Any,
77+
[E_Base_GAZ] = STATE_Emoji,
78+
[Any] = STATE_Any,
79+
[eot] = STATE_eot},
80+
[STATE_eot] = {[CR] = STATE_BREAK,
81+
[LF] = STATE_BREAK,
82+
[Control] = STATE_BREAK,
83+
[Extend] = STATE_BREAK,
84+
[ZWJ] = STATE_BREAK,
85+
[Regional_Indicator] = STATE_BREAK,
86+
[Prepend] = STATE_BREAK,
87+
[SpacingMark] = STATE_BREAK,
88+
[L] = STATE_BREAK,
89+
[V] = STATE_BREAK,
90+
[T] = STATE_BREAK,
91+
[LV] = STATE_BREAK,
92+
[LVT] = STATE_BREAK,
93+
[E_Base] = STATE_BREAK,
94+
[E_Modifier] = STATE_BREAK,
95+
[Glue_After_Zwj] = STATE_BREAK,
96+
[E_Base_GAZ] = STATE_BREAK,
97+
[Any] = STATE_BREAK,
98+
[eot] = STATE_BREAK},
99+
[STATE_CR] = {[CR] = STATE_BREAK,
100+
[LF] = STATE_LF,
101+
[Control] = STATE_BREAK,
102+
[Extend] = STATE_BREAK,
103+
[ZWJ] = STATE_BREAK,
104+
[Regional_Indicator] = STATE_BREAK,
105+
[Prepend] = STATE_BREAK,
106+
[SpacingMark] = STATE_BREAK,
107+
[L] = STATE_BREAK,
108+
[V] = STATE_BREAK,
109+
[T] = STATE_BREAK,
110+
[LV] = STATE_BREAK,
111+
[LVT] = STATE_BREAK,
112+
[E_Base] = STATE_BREAK,
113+
[E_Modifier] = STATE_BREAK,
114+
[Glue_After_Zwj] = STATE_BREAK,
115+
[E_Base_GAZ] = STATE_BREAK,
116+
[Any] = STATE_BREAK,
117+
[eot] = STATE_BREAK},
118+
[STATE_LF] = {[CR] = STATE_BREAK,
119+
[LF] = STATE_BREAK,
120+
[Control] = STATE_BREAK,
121+
[Extend] = STATE_BREAK,
122+
[ZWJ] = STATE_BREAK,
123+
[Regional_Indicator] = STATE_BREAK,
124+
[Prepend] = STATE_BREAK,
125+
[SpacingMark] = STATE_BREAK,
126+
[L] = STATE_BREAK,
127+
[V] = STATE_BREAK,
128+
[T] = STATE_BREAK,
129+
[LV] = STATE_BREAK,
130+
[LVT] = STATE_BREAK,
131+
[E_Base] = STATE_BREAK,
132+
[E_Modifier] = STATE_BREAK,
133+
[Glue_After_Zwj] = STATE_BREAK,
134+
[E_Base_GAZ] = STATE_BREAK,
135+
[Any] = STATE_BREAK,
136+
[eot] = STATE_BREAK},
137+
[STATE_Control] = {[CR] = STATE_BREAK,
138+
[LF] = STATE_BREAK,
139+
[Control] = STATE_BREAK,
140+
[Extend] = STATE_BREAK,
141+
[ZWJ] = STATE_BREAK,
142+
[Regional_Indicator] = STATE_BREAK,
143+
[Prepend] = STATE_BREAK,
144+
[SpacingMark] = STATE_BREAK,
145+
[L] = STATE_BREAK,
146+
[V] = STATE_BREAK,
147+
[T] = STATE_BREAK,
148+
[LV] = STATE_BREAK,
149+
[LVT] = STATE_BREAK,
150+
[E_Base] = STATE_BREAK,
151+
[E_Modifier] = STATE_BREAK,
152+
[Glue_After_Zwj] = STATE_BREAK,
153+
[E_Base_GAZ] = STATE_BREAK,
154+
[Any] = STATE_BREAK,
155+
[eot] = STATE_BREAK},
156+
[STATE_L] = {[CR] = STATE_BREAK,
157+
[LF] = STATE_BREAK,
158+
[Control] = STATE_BREAK,
159+
[Extend] = STATE_Any,
160+
[ZWJ] = STATE_ZWJ,
161+
[Regional_Indicator] = STATE_BREAK,
162+
[Prepend] = STATE_BREAK,
163+
[SpacingMark] = STATE_Any,
164+
[L] = STATE_L,
165+
[V] = STATE_V_or_LV,
166+
[T] = STATE_BREAK,
167+
[LV] = STATE_V_or_LV,
168+
[LVT] = STATE_T_or_LVT,
169+
[E_Base] = STATE_BREAK,
170+
[E_Modifier] = STATE_BREAK,
171+
[Glue_After_Zwj] = STATE_BREAK,
172+
[E_Base_GAZ] = STATE_BREAK,
173+
[Any] = STATE_BREAK,
174+
[eot] = STATE_BREAK},
175+
[STATE_V_or_LV] = {[CR] = STATE_BREAK,
176+
[LF] = STATE_BREAK,
177+
[Control] = STATE_BREAK,
178+
[Extend] = STATE_Any,
179+
[ZWJ] = STATE_ZWJ,
180+
[Regional_Indicator] = STATE_BREAK,
181+
[Prepend] = STATE_BREAK,
182+
[SpacingMark] = STATE_Any,
183+
[L] = STATE_BREAK,
184+
[V] = STATE_V_or_LV,
185+
[T] = STATE_T_or_LVT,
186+
[LV] = STATE_BREAK,
187+
[LVT] = STATE_BREAK,
188+
[E_Base] = STATE_BREAK,
189+
[E_Modifier] = STATE_BREAK,
190+
[Glue_After_Zwj] = STATE_BREAK,
191+
[E_Base_GAZ] = STATE_BREAK,
192+
[Any] = STATE_BREAK,
193+
[eot] = STATE_BREAK},
194+
[STATE_T_or_LVT] = {[CR] = STATE_BREAK,
195+
[LF] = STATE_BREAK,
196+
[Control] = STATE_BREAK,
197+
[Extend] = STATE_Any,
198+
[ZWJ] = STATE_ZWJ,
199+
[Regional_Indicator] = STATE_BREAK,
200+
[Prepend] = STATE_BREAK,
201+
[SpacingMark] = STATE_Any,
202+
[L] = STATE_BREAK,
203+
[V] = STATE_BREAK,
204+
[T] = STATE_T_or_LVT,
205+
[LV] = STATE_BREAK,
206+
[LVT] = STATE_BREAK,
207+
[E_Base] = STATE_BREAK,
208+
[E_Modifier] = STATE_BREAK,
209+
[Glue_After_Zwj] = STATE_BREAK,
210+
[E_Base_GAZ] = STATE_BREAK,
211+
[Any] = STATE_BREAK,
212+
[eot] = STATE_BREAK},
213+
[STATE_Prepend] = {[CR] = STATE_BREAK,
214+
[LF] = STATE_BREAK,
215+
[Control] = STATE_BREAK,
216+
[Extend] = STATE_Any,
217+
[ZWJ] = STATE_ZWJ,
218+
[Regional_Indicator] = STATE_RI_1,
219+
[Prepend] = STATE_Prepend,
220+
[SpacingMark] = STATE_Any,
221+
[L] = STATE_L,
222+
[V] = STATE_V_or_LV,
223+
[T] = STATE_T_or_LVT,
224+
[LV] = STATE_V_or_LV,
225+
[LVT] = STATE_T_or_LVT,
226+
[E_Base] = STATE_Emoji,
227+
[E_Modifier] = STATE_Any,
228+
[Glue_After_Zwj] = STATE_Any,
229+
[E_Base_GAZ] = STATE_Emoji,
230+
[Any] = STATE_Any,
231+
[eot] = STATE_BREAK},
232+
[STATE_ZWJ] = {[CR] = STATE_BREAK,
233+
[LF] = STATE_BREAK,
234+
[Control] = STATE_BREAK,
235+
[Extend] = STATE_Any,
236+
[ZWJ] = STATE_ZWJ,
237+
[Regional_Indicator] = STATE_BREAK,
238+
[Prepend] = STATE_BREAK,
239+
[SpacingMark] = STATE_Any,
240+
[L] = STATE_BREAK,
241+
[V] = STATE_BREAK,
242+
[T] = STATE_BREAK,
243+
[LV] = STATE_BREAK,
244+
[LVT] = STATE_BREAK,
245+
[E_Base] = STATE_BREAK,
246+
[E_Modifier] = STATE_BREAK,
247+
[Glue_After_Zwj] = STATE_Any,
248+
[E_Base_GAZ] = STATE_Emoji,
249+
[Any] = STATE_BREAK,
250+
[eot] = STATE_BREAK},
251+
[STATE_Emoji] = {[CR] = STATE_BREAK,
252+
[LF] = STATE_BREAK,
253+
[Control] = STATE_BREAK,
254+
[Extend] = STATE_Emoji,
255+
[ZWJ] = STATE_ZWJ,
256+
[Regional_Indicator] = STATE_BREAK,
257+
[Prepend] = STATE_BREAK,
258+
[SpacingMark] = STATE_Any,
259+
[L] = STATE_BREAK,
260+
[V] = STATE_BREAK,
261+
[T] = STATE_BREAK,
262+
[LV] = STATE_BREAK,
263+
[LVT] = STATE_BREAK,
264+
[E_Base] = STATE_BREAK,
265+
[E_Modifier] = STATE_Any,
266+
[Glue_After_Zwj] = STATE_BREAK,
267+
[E_Base_GAZ] = STATE_BREAK,
268+
[Any] = STATE_BREAK,
269+
[eot] = STATE_BREAK},
270+
[STATE_RI_1] = {[CR] = STATE_BREAK,
271+
[LF] = STATE_BREAK,
272+
[Control] = STATE_BREAK,
273+
[Extend] = STATE_Any,
274+
[ZWJ] = STATE_ZWJ,
275+
[Regional_Indicator] = STATE_RI_2,
276+
[Prepend] = STATE_BREAK,
277+
[SpacingMark] = STATE_Any,
278+
[L] = STATE_BREAK,
279+
[V] = STATE_BREAK,
280+
[T] = STATE_BREAK,
281+
[LV] = STATE_BREAK,
282+
[LVT] = STATE_BREAK,
283+
[E_Base] = STATE_BREAK,
284+
[E_Modifier] = STATE_BREAK,
285+
[Glue_After_Zwj] = STATE_BREAK,
286+
[E_Base_GAZ] = STATE_BREAK,
287+
[Any] = STATE_BREAK,
288+
[eot] = STATE_BREAK},
289+
[STATE_RI_2] = {[CR] = STATE_BREAK,
290+
[LF] = STATE_BREAK,
291+
[Control] = STATE_BREAK,
292+
[Extend] = STATE_Any,
293+
[ZWJ] = STATE_ZWJ,
294+
[Regional_Indicator] = STATE_BREAK,
295+
[Prepend] = STATE_BREAK,
296+
[SpacingMark] = STATE_Any,
297+
[L] = STATE_BREAK,
298+
[V] = STATE_BREAK,
299+
[T] = STATE_BREAK,
300+
[LV] = STATE_BREAK,
301+
[LVT] = STATE_BREAK,
302+
[E_Base] = STATE_BREAK,
303+
[E_Modifier] = STATE_BREAK,
304+
[Glue_After_Zwj] = STATE_BREAK,
305+
[E_Base_GAZ] = STATE_BREAK,
306+
[Any] = STATE_BREAK,
307+
[eot] = STATE_BREAK},
308+
[STATE_Any] = {[CR] = STATE_BREAK,
309+
[LF] = STATE_BREAK,
310+
[Control] = STATE_BREAK,
311+
[Extend] = STATE_Any,
312+
[ZWJ] = STATE_ZWJ,
313+
[Regional_Indicator] = STATE_BREAK,
314+
[Prepend] = STATE_BREAK,
315+
[SpacingMark] = STATE_Any,
316+
[L] = STATE_BREAK,
317+
[V] = STATE_BREAK,
318+
[T] = STATE_BREAK,
319+
[LV] = STATE_BREAK,
320+
[LVT] = STATE_BREAK,
321+
[E_Base] = STATE_BREAK,
322+
[E_Modifier] = STATE_BREAK,
323+
[Glue_After_Zwj] = STATE_BREAK,
324+
[E_Base_GAZ] = STATE_BREAK,
325+
[Any] = STATE_BREAK,
326+
[eot] = STATE_BREAK},
327+
};

0 commit comments

Comments
 (0)