@@ -47,6 +47,63 @@ static inline Py_UCS4 Py_UNICODE_LOW_SURROGATE(Py_UCS4 ch) {
47
47
48
48
/* --- Unicode Type ------------------------------------------------------- */
49
49
50
/* Per-string flag word shared by the unicode object structures.
   Member order, types and widths are part of the object layout and must
   not be changed. */
struct _PyUnicodeObject_state {
    /* Interning status.  A non-zero value means the two references held
       by the interning dictionary are *not* included in ob_refcnt.

           0: Not Interned
           1: Interned
           2: Interned and Immortal
           3: Interned, Immortal, and Static

       The runtime uses this classification to pick the right cleanup
       mechanism at runtime shutdown. */
#if defined(Py_GIL_DISABLED)
    // Must be accessed atomically, which rules out a bit field.
    unsigned char interned;
#else
    unsigned int interned:2;
#endif

    /* Width of one stored character:

       PyUnicode_1BYTE_KIND (1)
           - character type is Py_UCS1 (8 bits, unsigned)
           - every character lies in U+0000-U+00FF (latin1)
           - with ascii set, every character lies in U+0000-U+007F
             (ASCII); otherwise at least one character lies in
             U+0080-U+00FF

       PyUnicode_2BYTE_KIND (2)
           - character type is Py_UCS2 (16 bits, unsigned)
           - every character lies in U+0000-U+FFFF (BMP)
           - at least one character lies in U+0100-U+FFFF

       PyUnicode_4BYTE_KIND (4)
           - character type is Py_UCS4 (32 bits, unsigned)
           - every character lies in U+0000-U+10FFFF
           - at least one character lies in U+10000-U+10FFFF
     */
    unsigned int kind:3;

    /* Allocation scheme.  A compact object lives in a single memory
       block; a non-compact object uses one block for the PyUnicodeObject
       struct and a second block for its data buffer. */
    unsigned int compact:1;

    /* Set when the kind is PyUnicode_1BYTE_KIND and every character is
       in U+0000-U+007F (ASCII).  When both ascii and compact are set,
       use the PyASCIIObject structure. */
    unsigned int ascii:1;

    /* Set when the object is statically allocated. */
    unsigned int statically_allocated:1;

#if !defined(Py_GIL_DISABLED)
    /* Historical padding that keeps PyUnicode_DATA() aligned to 4 bytes
       (see issue gh-63736 on m68k).  Together with the members above it
       brings the bit-fields to 32 bits in total. */
    unsigned int :24;
#endif
};
106
+
50
107
/* ASCII-only strings created through PyUnicode_New use the PyASCIIObject
51
108
structure. state.ascii and state.compact are set, and the data
52
109
immediately follow the structure. utf8_length can be found
@@ -99,67 +156,8 @@ typedef struct {
99
156
PyObject_HEAD
100
157
Py_ssize_t length ; /* Number of code points in the string */
101
158
Py_hash_t hash ; /* Hash value; -1 if not set */
102
- #ifdef Py_GIL_DISABLED
103
- /* Ensure 4 byte alignment for PyUnicode_DATA(), see gh-63736 on m68k.
104
- In the non-free-threaded build, we'll use explicit padding instead */
105
- _Py_ALIGN_AS (4 )
106
- #endif
107
- struct {
108
- /* If interned is non-zero, the two references from the
109
- dictionary to this object are *not* counted in ob_refcnt.
110
- The possible values here are:
111
- 0: Not Interned
112
- 1: Interned
113
- 2: Interned and Immortal
114
- 3: Interned, Immortal, and Static
115
- This categorization allows the runtime to determine the right
116
- cleanup mechanism at runtime shutdown. */
117
- #ifdef Py_GIL_DISABLED
118
- // Needs to be accessed atomically, so can't be a bit field.
119
- unsigned char interned ;
120
- #else
121
- unsigned int interned :2 ;
122
- #endif
123
- /* Character size:
124
-
125
- - PyUnicode_1BYTE_KIND (1):
126
-
127
- * character type = Py_UCS1 (8 bits, unsigned)
128
- * all characters are in the range U+0000-U+00FF (latin1)
129
- * if ascii is set, all characters are in the range U+0000-U+007F
130
- (ASCII), otherwise at least one character is in the range
131
- U+0080-U+00FF
132
-
133
- - PyUnicode_2BYTE_KIND (2):
134
-
135
- * character type = Py_UCS2 (16 bits, unsigned)
136
- * all characters are in the range U+0000-U+FFFF (BMP)
137
- * at least one character is in the range U+0100-U+FFFF
138
-
139
- - PyUnicode_4BYTE_KIND (4):
140
-
141
- * character type = Py_UCS4 (32 bits, unsigned)
142
- * all characters are in the range U+0000-U+10FFFF
143
- * at least one character is in the range U+10000-U+10FFFF
144
- */
145
- unsigned int kind :3 ;
146
- /* Compact is with respect to the allocation scheme. Compact unicode
147
- objects only require one memory block while non-compact objects use
148
- one block for the PyUnicodeObject struct and another for its data
149
- buffer. */
150
- unsigned int compact :1 ;
151
- /* The string only contains characters in the range U+0000-U+007F (ASCII)
152
- and the kind is PyUnicode_1BYTE_KIND. If ascii is set and compact is
153
- set, use the PyASCIIObject structure. */
154
- unsigned int ascii :1 ;
155
- /* The object is statically allocated. */
156
- unsigned int statically_allocated :1 ;
157
- #ifndef Py_GIL_DISABLED
158
- /* Padding to ensure that PyUnicode_DATA() is always aligned to
159
- 4 bytes (see issue gh-63736 on m68k) */
160
- unsigned int :24 ;
161
- #endif
162
- } state ;
159
+ /* Ensure 4 byte alignment for PyUnicode_DATA(), see gh-63736 on m68k. */
160
+ _Py_ALIGNED_DEF (4 , struct _PyUnicodeObject_state ) state ;
163
161
} PyASCIIObject ;
164
162
165
163
/* Non-ASCII strings allocated through PyUnicode_New use the
0 commit comments