Skip to content

Commit 14c79d4

Browse files
committed
gh-89188: replace bitfield with struct fields in PyASCIIObject
1 parent 12226be commit 14c79d4

File tree

7 files changed: +102 additions, -93 deletions

Include/cpython/unicodeobject.h

Lines changed: 40 additions & 51 deletions
Original file line numberDiff line numberDiff line change
@@ -97,46 +97,41 @@ typedef struct {
9797
PyObject_HEAD
9898
Py_ssize_t length; /* Number of code points in the string */
9999
Py_hash_t hash; /* Hash value; -1 if not set */
100-
struct {
101-
/* If interned is set, the two references from the
102-
dictionary to this object are *not* counted in ob_refcnt. */
103-
unsigned int interned:1;
104-
/* Character size:
105-
106-
- PyUnicode_1BYTE_KIND (1):
107-
108-
* character type = Py_UCS1 (8 bits, unsigned)
109-
* all characters are in the range U+0000-U+00FF (latin1)
110-
* if ascii is set, all characters are in the range U+0000-U+007F
111-
(ASCII), otherwise at least one character is in the range
112-
U+0080-U+00FF
113-
114-
- PyUnicode_2BYTE_KIND (2):
115-
116-
* character type = Py_UCS2 (16 bits, unsigned)
117-
* all characters are in the range U+0000-U+FFFF (BMP)
118-
* at least one character is in the range U+0100-U+FFFF
119-
120-
- PyUnicode_4BYTE_KIND (4):
121-
122-
* character type = Py_UCS4 (32 bits, unsigned)
123-
* all characters are in the range U+0000-U+10FFFF
124-
* at least one character is in the range U+10000-U+10FFFF
125-
*/
126-
unsigned int kind:3;
127-
/* Compact is with respect to the allocation scheme. Compact unicode
128-
objects only require one memory block while non-compact objects use
129-
one block for the PyUnicodeObject struct and another for its data
130-
buffer. */
131-
unsigned int compact:1;
132-
/* The string only contains characters in the range U+0000-U+007F (ASCII)
133-
and the kind is PyUnicode_1BYTE_KIND. If ascii is set and compact is
134-
set, use the PyASCIIObject structure. */
135-
unsigned int ascii:1;
136-
/* Padding to ensure that PyUnicode_DATA() is always aligned to
137-
4 bytes (see issue #19537 on m68k). */
138-
unsigned int :26;
139-
} state;
100+
/* If interned is set, the two references from the
101+
dictionary to this object are *not* counted in ob_refcnt. */
102+
uint8_t interned;
103+
/* Character size:
104+
105+
- PyUnicode_1BYTE_KIND (1):
106+
107+
* character type = Py_UCS1 (8 bits, unsigned)
108+
* all characters are in the range U+0000-U+00FF (latin1)
109+
* if ascii is set, all characters are in the range U+0000-U+007F
110+
(ASCII), otherwise at least one character is in the range
111+
U+0080-U+00FF
112+
113+
- PyUnicode_2BYTE_KIND (2):
114+
115+
* character type = Py_UCS2 (16 bits, unsigned)
116+
* all characters are in the range U+0000-U+FFFF (BMP)
117+
* at least one character is in the range U+0100-U+FFFF
118+
119+
- PyUnicode_4BYTE_KIND (4):
120+
121+
* character type = Py_UCS4 (32 bits, unsigned)
122+
* all characters are in the range U+0000-U+10FFFF
123+
* at least one character is in the range U+10000-U+10FFFF
124+
*/
125+
uint8_t kind;
126+
/* Compact is with respect to the allocation scheme. Compact unicode
127+
objects only require one memory block while non-compact objects use
128+
one block for the PyUnicodeObject struct and another for its data
129+
buffer. */
130+
uint8_t compact;
131+
/* The string only contains characters in the range U+0000-U+007F (ASCII)
132+
and the kind is PyUnicode_1BYTE_KIND. If ascii is set and compact is
133+
set, use the PyASCIIObject structure. */
134+
uint8_t ascii;
140135
} PyASCIIObject;
141136

142137
/* Non-ASCII strings allocated through PyUnicode_New use the
@@ -178,15 +173,9 @@ PyAPI_FUNC(int) _PyUnicode_CheckConsistency(
178173

179174
/* --- Flexible String Representation Helper Macros (PEP 393) -------------- */
180175

181-
/* Values for PyASCIIObject.state: */
182-
183-
/* Interning state. */
184-
#define SSTATE_NOT_INTERNED 0
185-
#define SSTATE_INTERNED_MORTAL 1
186-
187176
/* Use only if you know it's a string */
188177
static inline unsigned int PyUnicode_CHECK_INTERNED(PyObject *op) {
189-
return _PyASCIIObject_CAST(op)->state.interned;
178+
return _PyASCIIObject_CAST(op)->interned;
190179
}
191180
#define PyUnicode_CHECK_INTERNED(op) PyUnicode_CHECK_INTERNED(_PyObject_CAST(op))
192181

@@ -200,21 +189,21 @@ static inline unsigned int PyUnicode_IS_READY(PyObject* Py_UNUSED(op)) {
200189
string may be compact (PyUnicode_IS_COMPACT_ASCII) or not, but must be
201190
ready. */
202191
static inline unsigned int PyUnicode_IS_ASCII(PyObject *op) {
203-
return _PyASCIIObject_CAST(op)->state.ascii;
192+
return _PyASCIIObject_CAST(op)->ascii;
204193
}
205194
#define PyUnicode_IS_ASCII(op) PyUnicode_IS_ASCII(_PyObject_CAST(op))
206195

207196
/* Return true if the string is compact or 0 if not.
208197
No type checks or Ready calls are performed. */
209198
static inline unsigned int PyUnicode_IS_COMPACT(PyObject *op) {
210-
return _PyASCIIObject_CAST(op)->state.compact;
199+
return _PyASCIIObject_CAST(op)->compact;
211200
}
212201
#define PyUnicode_IS_COMPACT(op) PyUnicode_IS_COMPACT(_PyObject_CAST(op))
213202

214203
/* Return true if the string is a compact ASCII string (use PyASCIIObject
215204
structure), or 0 if not. No type checks or Ready calls are performed. */
216205
static inline int PyUnicode_IS_COMPACT_ASCII(PyObject *op) {
217-
return (_PyASCIIObject_CAST(op)->state.ascii && PyUnicode_IS_COMPACT(op));
206+
return (_PyASCIIObject_CAST(op)->ascii && PyUnicode_IS_COMPACT(op));
218207
}
219208
#define PyUnicode_IS_COMPACT_ASCII(op) PyUnicode_IS_COMPACT_ASCII(_PyObject_CAST(op))
220209

@@ -231,7 +220,7 @@ enum PyUnicode_Kind {
231220
// new compiler warnings on "kind < PyUnicode_KIND(str)" (compare signed and
232221
// unsigned numbers) where kind type is an int or on
233222
// "unsigned int kind = PyUnicode_KIND(str)" (cast signed to unsigned).
234-
#define PyUnicode_KIND(op) _Py_RVALUE(_PyASCIIObject_CAST(op)->state.kind)
223+
#define PyUnicode_KIND(op) _Py_RVALUE(_PyASCIIObject_CAST(op)->kind)
235224

236225
/* Return a void pointer to the raw unicode buffer. */
237226
static inline void* _PyUnicode_COMPACT_DATA(PyObject *op) {

Include/internal/pycore_runtime_init.h

Lines changed: 3 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -155,11 +155,9 @@ extern PyTypeObject _PyExc_MemoryError;
155155
.ob_base = _PyObject_IMMORTAL_INIT(&PyUnicode_Type), \
156156
.length = sizeof(LITERAL) - 1, \
157157
.hash = -1, \
158-
.state = { \
159-
.kind = 1, \
160-
.compact = 1, \
161-
.ascii = (ASCII), \
162-
}, \
158+
.kind = 1, \
159+
.compact = 1, \
160+
.ascii = (ASCII), \
163161
}
164162
#define _PyASCIIObject_INIT(LITERAL) \
165163
{ \

Lib/test/test_capi/test_misc.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1555,5 +1555,11 @@ def func2(x=None):
15551555
self.do_test(func2)
15561556

15571557

1558+
class Test_UnicodeObjectAlignment(unittest.TestCase):
    """Check the in-memory layout of compact unicode objects."""

    def test_unicodeobject_data_alignment(self):
        # The C helper sets AssertionError (failing this test) when
        # sizeof(PyCompactUnicodeObject) is not a multiple of 4 bytes.
        _testinternalcapi.check_compactunicodeobject_data_alignment()
1562+
1563+
15581564
if __name__ == "__main__":
15591565
unittest.main()

Modules/_testinternalcapi.c

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -684,6 +684,21 @@ clear_extension(PyObject *self, PyObject *args)
684684
Py_RETURN_NONE;
685685
}
686686

687+
/* Verify that the character data of a compact unicode object is aligned.

   The data buffer immediately follows the PyCompactUnicodeObject header,
   so sizeof(PyCompactUnicodeObject) is the offset of the data.  The widest
   character kind (UCS4) is 4 bytes, hence the offset must be a multiple
   of 4.

   Registered as METH_NOARGS, so it uses the standard PyCFunction signature
   and ignores both arguments.

   Returns None on success.  On failure, sets AssertionError and returns
   NULL. */
static PyObject *
check_compactunicodeobject_data_alignment(PyObject *Py_UNUSED(self),
                                          PyObject *Py_UNUSED(args))
{
    size_t data_offset = sizeof(PyCompactUnicodeObject);
    if (data_offset % 4 != 0) {
        // This is required so that the data (which immediately follows a
        // compact unicode object) is correctly aligned in the largest
        // case (UCS4).  %zu is the conversion that matches size_t.
        PyErr_Format(PyExc_AssertionError,
                     "PyCompactUnicodeObject size offset is %zu, "
                     "needs to be multiple of 4 bytes",
                     data_offset);
        return NULL;
    }
    Py_RETURN_NONE;
}
701+
687702

688703
static PyMethodDef module_functions[] = {
689704
{"get_configs", get_configs, METH_NOARGS},
@@ -707,6 +722,7 @@ static PyMethodDef module_functions[] = {
707722
_TESTINTERNALCAPI_OPTIMIZE_CFG_METHODDEF
708723
{"get_interp_settings", get_interp_settings, METH_VARARGS, NULL},
709724
{"clear_extension", clear_extension, METH_VARARGS, NULL},
725+
{"check_compactunicodeobject_data_alignment", check_compactunicodeobject_data_alignment, METH_NOARGS, NULL},
710726
{NULL, NULL} /* sentinel */
711727
};
712728

Objects/unicodeobject.c

Lines changed: 27 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -129,13 +129,16 @@ extern "C" {
129129

130130
#define _PyUnicode_LENGTH(op) \
131131
(_PyASCIIObject_CAST(op)->length)
132-
#define _PyUnicode_STATE(op) \
133-
(_PyASCIIObject_CAST(op)->state)
134132
#define _PyUnicode_HASH(op) \
135133
(_PyASCIIObject_CAST(op)->hash)
134+
#define _PyUnicode_INTERNED(op) \
135+
(_PyASCIIObject_CAST(op)->interned)
136136
#define _PyUnicode_KIND(op) \
137-
(assert(_PyUnicode_CHECK(op)), \
138-
_PyASCIIObject_CAST(op)->state.kind)
137+
(_PyASCIIObject_CAST(op)->kind)
138+
#define _PyUnicode_COMPACT(op) \
139+
(_PyASCIIObject_CAST(op)->compact)
140+
#define _PyUnicode_ASCII(op) \
141+
(_PyASCIIObject_CAST(op)->ascii)
139142
#define _PyUnicode_GET_LENGTH(op) \
140143
(assert(_PyUnicode_CHECK(op)), \
141144
_PyASCIIObject_CAST(op)->length)
@@ -497,21 +500,21 @@ _PyUnicode_CheckConsistency(PyObject *op, int check_content)
497500
CHECK(PyUnicode_Check(op));
498501

499502
PyASCIIObject *ascii = _PyASCIIObject_CAST(op);
500-
int kind = ascii->state.kind;
503+
int kind = ascii->kind;
501504

502-
if (ascii->state.ascii == 1 && ascii->state.compact == 1) {
505+
if (ascii->ascii == 1 && ascii->compact == 1) {
503506
CHECK(kind == PyUnicode_1BYTE_KIND);
504507
}
505508
else {
506509
PyCompactUnicodeObject *compact = _PyCompactUnicodeObject_CAST(op);
507510
void *data;
508511

509-
if (ascii->state.compact == 1) {
512+
if (ascii->compact == 1) {
510513
data = compact + 1;
511514
CHECK(kind == PyUnicode_1BYTE_KIND
512515
|| kind == PyUnicode_2BYTE_KIND
513516
|| kind == PyUnicode_4BYTE_KIND);
514-
CHECK(ascii->state.ascii == 0);
517+
CHECK(ascii->ascii == 0);
515518
CHECK(compact->utf8 != data);
516519
}
517520
else {
@@ -521,9 +524,9 @@ _PyUnicode_CheckConsistency(PyObject *op, int check_content)
521524
CHECK(kind == PyUnicode_1BYTE_KIND
522525
|| kind == PyUnicode_2BYTE_KIND
523526
|| kind == PyUnicode_4BYTE_KIND);
524-
CHECK(ascii->state.compact == 0);
527+
CHECK(ascii->compact == 0);
525528
CHECK(data != NULL);
526-
if (ascii->state.ascii) {
529+
if (ascii->ascii) {
527530
CHECK(compact->utf8 == data);
528531
CHECK(compact->utf8_length == ascii->length);
529532
}
@@ -551,7 +554,7 @@ _PyUnicode_CheckConsistency(PyObject *op, int check_content)
551554
maxchar = ch;
552555
}
553556
if (kind == PyUnicode_1BYTE_KIND) {
554-
if (ascii->state.ascii == 0) {
557+
if (ascii->ascii == 0) {
555558
CHECK(maxchar >= 128);
556559
CHECK(maxchar <= 255);
557560
}
@@ -1108,9 +1111,9 @@ _PyUnicode_Dump(PyObject *op)
11081111
PyUnicodeObject *unicode = _PyUnicodeObject_CAST(op);
11091112
const void *data;
11101113

1111-
if (ascii->state.compact)
1114+
if (ascii->compact)
11121115
{
1113-
if (ascii->state.ascii)
1116+
if (ascii->ascii)
11141117
data = (ascii + 1);
11151118
else
11161119
data = (compact + 1);
@@ -1119,7 +1122,7 @@ _PyUnicode_Dump(PyObject *op)
11191122
data = unicode->data.any;
11201123
printf("%s: len=%zu, ", unicode_kind_name(op), ascii->length);
11211124

1122-
if (!ascii->state.ascii) {
1125+
if (!ascii->ascii) {
11231126
printf("utf8=%p (%zu)", (void *)compact->utf8, compact->utf8_length);
11241127
}
11251128
printf(", data=%p\n", data);
@@ -1195,10 +1198,10 @@ PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar)
11951198
data = unicode + 1;
11961199
_PyUnicode_LENGTH(unicode) = size;
11971200
_PyUnicode_HASH(unicode) = -1;
1198-
_PyUnicode_STATE(unicode).interned = 0;
1199-
_PyUnicode_STATE(unicode).kind = kind;
1200-
_PyUnicode_STATE(unicode).compact = 1;
1201-
_PyUnicode_STATE(unicode).ascii = is_ascii;
1201+
_PyUnicode_INTERNED(unicode) = 0;
1202+
_PyUnicode_KIND(unicode) = kind;
1203+
_PyUnicode_COMPACT(unicode) = 1;
1204+
_PyUnicode_ASCII(unicode) = is_ascii;
12021205
if (is_ascii) {
12031206
((char*)data)[size] = 0;
12041207
}
@@ -14372,10 +14375,10 @@ unicode_subtype_new(PyTypeObject *type, PyObject *unicode)
1437214375
#else
1437314376
_PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
1437414377
#endif
14375-
_PyUnicode_STATE(self).interned = 0;
14376-
_PyUnicode_STATE(self).kind = kind;
14377-
_PyUnicode_STATE(self).compact = 0;
14378-
_PyUnicode_STATE(self).ascii = _PyUnicode_STATE(unicode).ascii;
14378+
_PyUnicode_INTERNED(self) = 0;
14379+
_PyUnicode_KIND(self) = kind;
14380+
_PyUnicode_COMPACT(self) = 0;
14381+
_PyUnicode_ASCII(self) = _PyUnicode_ASCII(unicode);
1437914382
_PyUnicode_UTF8_LENGTH(self) = 0;
1438014383
_PyUnicode_UTF8(self) = NULL;
1438114384
_PyUnicode_DATA_ANY(self) = NULL;
@@ -14624,7 +14627,7 @@ PyUnicode_InternInPlace(PyObject **p)
1462414627
refcnt. unicode_dealloc() and _PyUnicode_ClearInterned() take care of
1462514628
this. */
1462614629
Py_SET_REFCNT(s, Py_REFCNT(s) - 2);
14627-
_PyUnicode_STATE(s).interned = 1;
14630+
_PyUnicode_INTERNED(s) = 1;
1462814631
}
1462914632

1463014633
// Function kept for the stable ABI.
@@ -14683,7 +14686,7 @@ _PyUnicode_ClearInterned(PyInterpreterState *interp)
1468314686
total_length += PyUnicode_GET_LENGTH(s);
1468414687
#endif
1468514688

14686-
_PyUnicode_STATE(s).interned = 0;
14689+
_PyUnicode_INTERNED(s) = 0;
1468714690
}
1468814691
#ifdef INTERNED_STATS
1468914692
fprintf(stderr,

Python/traceback.c

Lines changed: 4 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1092,9 +1092,9 @@ _Py_DumpASCII(int fd, PyObject *text)
10921092
return;
10931093

10941094
size = ascii->length;
1095-
kind = ascii->state.kind;
1096-
if (ascii->state.compact) {
1097-
if (ascii->state.ascii)
1095+
kind = ascii->kind;
1096+
if (ascii->compact) {
1097+
if (ascii->ascii)
10981098
data = ascii + 1;
10991099
else
11001100
data = _PyCompactUnicodeObject_CAST(text) + 1;
@@ -1114,7 +1114,7 @@ _Py_DumpASCII(int fd, PyObject *text)
11141114
}
11151115

11161116
// Is an ASCII string?
1117-
if (ascii->state.ascii) {
1117+
if (ascii->ascii) {
11181118
assert(kind == PyUnicode_1BYTE_KIND);
11191119
char *str = data;
11201120

@@ -1341,4 +1341,3 @@ _Py_DumpTracebackThreads(int fd, PyInterpreterState *interp,
13411341

13421342
return NULL;
13431343
}
1344-

Tools/build/deepfreeze.py

Lines changed: 6 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -198,10 +198,9 @@ def generate_unicode(self, name: str, s: str) -> str:
198198
self.object_head("PyUnicode_Type")
199199
self.write(f".length = {len(s)},")
200200
self.write(".hash = -1,")
201-
with self.block(".state =", ","):
202-
self.write(".kind = 1,")
203-
self.write(".compact = 1,")
204-
self.write(".ascii = 1,")
201+
self.write(".kind = 1,")
202+
self.write(".compact = 1,")
203+
self.write(".ascii = 1,")
205204
self.write(f"._data = {make_string_literal(s.encode('ascii'))},")
206205
return f"& {name}._ascii.ob_base"
207206
else:
@@ -210,10 +209,9 @@ def generate_unicode(self, name: str, s: str) -> str:
210209
self.object_head("PyUnicode_Type")
211210
self.write(f".length = {len(s)},")
212211
self.write(".hash = -1,")
213-
with self.block(".state =", ","):
214-
self.write(f".kind = {kind},")
215-
self.write(".compact = 1,")
216-
self.write(".ascii = 0,")
212+
self.write(f".kind = {kind},")
213+
self.write(".compact = 1,")
214+
self.write(".ascii = 0,")
217215
utf8 = s.encode('utf-8')
218216
self.write(f'.utf8 = {make_string_literal(utf8)},')
219217
self.write(f'.utf8_length = {len(utf8)},')

0 commit comments

Comments
 (0)