Skip to content

Commit f262297

Browse files
emmatyping and picnixz authored
gh-139877: Use PyBytesWriter in pycore_blocks_output_buffer.h (#139976)
Previously, the _BlocksOutputBuffer code created a list of bytes objects to handle the output data from compression libraries. This was slow because the output buffer code had to copy each bytes element of the list into the final bytes object buffer at the end of compression. The new PyBytesWriter API introduced in PEP 782 is an ergonomic and fast method of writing data into a buffer that will later turn into a bytes object. Benchmarks show that using the PyBytesWriter API is 10-30% faster for decompression across a variety of settings. The performance gains are greatest when the decompressor is very performant, such as for Zstandard (and likely zlib-ng). Otherwise the decompressor can bottleneck decompression and the gains are more modest, but still sizable (e.g. 10% faster for zlib)! Co-authored-by: Bénédikt Tran <[email protected]>
1 parent 4044255 commit f262297

File tree

7 files changed

+49
-109
lines changed

7 files changed

+49
-109
lines changed

Include/internal/pycore_blocks_output_buffer.h

Lines changed: 32 additions & 92 deletions
Original file line numberDiff line numberDiff line change
@@ -45,12 +45,14 @@ extern "C" {
4545
#endif
4646

4747
typedef struct {
48-
// List of bytes objects
49-
PyObject *list;
48+
// Bytes writer managing output buffer
49+
PyBytesWriter *writer;
5050
// Number of whole allocated size
5151
Py_ssize_t allocated;
52-
// Max length of the buffer, negative number means unlimited length.
52+
// Max length of the buffer, negative number means unlimited length
5353
Py_ssize_t max_length;
54+
// Number of blocks of bytes. Used to calculate next allocation size
55+
size_t num_blocks;
5456
} _BlocksOutputBuffer;
5557

5658
static const char unable_allocate_msg[] = "Unable to allocate output buffer.";
@@ -107,11 +109,10 @@ _BlocksOutputBuffer_InitAndGrow(_BlocksOutputBuffer *buffer,
107109
const Py_ssize_t max_length,
108110
void **next_out)
109111
{
110-
PyObject *b;
111112
Py_ssize_t block_size;
112113

113-
// ensure .list was set to NULL
114-
assert(buffer->list == NULL);
114+
// ensure .writer was set to NULL
115+
assert(buffer->writer == NULL);
115116

116117
// get block size
117118
if (0 <= max_length && max_length < BUFFER_BLOCK_SIZE[0]) {
@@ -120,25 +121,17 @@ _BlocksOutputBuffer_InitAndGrow(_BlocksOutputBuffer *buffer,
120121
block_size = BUFFER_BLOCK_SIZE[0];
121122
}
122123

123-
// the first block
124-
b = PyBytes_FromStringAndSize(NULL, block_size);
125-
if (b == NULL) {
124+
buffer->writer = PyBytesWriter_Create(block_size);
125+
if (buffer->writer == NULL) {
126126
return -1;
127127
}
128128

129-
// create the list
130-
buffer->list = PyList_New(1);
131-
if (buffer->list == NULL) {
132-
Py_DECREF(b);
133-
return -1;
134-
}
135-
PyList_SET_ITEM(buffer->list, 0, b);
136-
137129
// set variables
138130
buffer->allocated = block_size;
139131
buffer->max_length = max_length;
132+
buffer->num_blocks = 1;
140133

141-
*next_out = PyBytes_AS_STRING(b);
134+
*next_out = PyBytesWriter_GetData(buffer->writer);
142135
return block_size;
143136
}
144137

@@ -155,31 +148,21 @@ _BlocksOutputBuffer_InitWithSize(_BlocksOutputBuffer *buffer,
155148
const Py_ssize_t init_size,
156149
void **next_out)
157150
{
158-
PyObject *b;
159151

160-
// ensure .list was set to NULL
161-
assert(buffer->list == NULL);
152+
// ensure .writer was set to NULL
153+
assert(buffer->writer == NULL);
162154

163-
// the first block
164-
b = PyBytes_FromStringAndSize(NULL, init_size);
165-
if (b == NULL) {
166-
PyErr_SetString(PyExc_MemoryError, unable_allocate_msg);
155+
buffer->writer = PyBytesWriter_Create(init_size);
156+
if (buffer->writer == NULL) {
167157
return -1;
168158
}
169159

170-
// create the list
171-
buffer->list = PyList_New(1);
172-
if (buffer->list == NULL) {
173-
Py_DECREF(b);
174-
return -1;
175-
}
176-
PyList_SET_ITEM(buffer->list, 0, b);
177-
178160
// set variables
179161
buffer->allocated = init_size;
180162
buffer->max_length = -1;
163+
buffer->num_blocks = 1;
181164

182-
*next_out = PyBytes_AS_STRING(b);
165+
*next_out = PyBytesWriter_GetData(buffer->writer);
183166
return init_size;
184167
}
185168

@@ -193,8 +176,6 @@ _BlocksOutputBuffer_Grow(_BlocksOutputBuffer *buffer,
193176
void **next_out,
194177
const Py_ssize_t avail_out)
195178
{
196-
PyObject *b;
197-
const Py_ssize_t list_len = Py_SIZE(buffer->list);
198179
Py_ssize_t block_size;
199180

200181
// ensure no gaps in the data
@@ -205,11 +186,10 @@ _BlocksOutputBuffer_Grow(_BlocksOutputBuffer *buffer,
205186
}
206187

207188
// get block size
208-
if (list_len < (Py_ssize_t) Py_ARRAY_LENGTH(BUFFER_BLOCK_SIZE)) {
209-
block_size = BUFFER_BLOCK_SIZE[list_len];
210-
} else {
211-
block_size = BUFFER_BLOCK_SIZE[Py_ARRAY_LENGTH(BUFFER_BLOCK_SIZE) - 1];
212-
}
189+
size_t maxblock = Py_ARRAY_LENGTH(BUFFER_BLOCK_SIZE);
190+
assert(maxblock >= 1);
191+
size_t block_index = Py_MIN(buffer->num_blocks, maxblock - 1);
192+
block_size = BUFFER_BLOCK_SIZE[block_index];
213193

214194
// check max_length
215195
if (buffer->max_length >= 0) {
@@ -229,22 +209,19 @@ _BlocksOutputBuffer_Grow(_BlocksOutputBuffer *buffer,
229209
return -1;
230210
}
231211

232-
// create the block
233-
b = PyBytes_FromStringAndSize(NULL, block_size);
234-
if (b == NULL) {
212+
if (PyBytesWriter_Grow(buffer->writer, block_size)) {
235213
PyErr_SetString(PyExc_MemoryError, unable_allocate_msg);
236214
return -1;
237215
}
238-
if (PyList_Append(buffer->list, b) < 0) {
239-
Py_DECREF(b);
240-
return -1;
241-
}
242-
Py_DECREF(b);
216+
217+
Py_ssize_t current_size = buffer->allocated;
243218

244219
// set variables
245220
buffer->allocated += block_size;
221+
buffer->num_blocks += 1;
246222

247-
*next_out = PyBytes_AS_STRING(b);
223+
char *data = PyBytesWriter_GetData(buffer->writer);
224+
*next_out = data + current_size;
248225
return block_size;
249226
}
250227

@@ -265,54 +242,17 @@ static inline PyObject *
265242
_BlocksOutputBuffer_Finish(_BlocksOutputBuffer *buffer,
266243
const Py_ssize_t avail_out)
267244
{
268-
PyObject *result, *block;
269-
const Py_ssize_t list_len = Py_SIZE(buffer->list);
270-
271-
// fast path for single block
272-
if ((list_len == 1 && avail_out == 0) ||
273-
(list_len == 2 && Py_SIZE(PyList_GET_ITEM(buffer->list, 1)) == avail_out))
274-
{
275-
block = PyList_GET_ITEM(buffer->list, 0);
276-
Py_INCREF(block);
277-
278-
Py_CLEAR(buffer->list);
279-
return block;
280-
}
281-
282-
// final bytes object
283-
result = PyBytes_FromStringAndSize(NULL, buffer->allocated - avail_out);
284-
if (result == NULL) {
285-
PyErr_SetString(PyExc_MemoryError, unable_allocate_msg);
286-
return NULL;
287-
}
288-
289-
// memory copy
290-
if (list_len > 0) {
291-
char *posi = PyBytes_AS_STRING(result);
292-
293-
// blocks except the last one
294-
Py_ssize_t i = 0;
295-
for (; i < list_len-1; i++) {
296-
block = PyList_GET_ITEM(buffer->list, i);
297-
memcpy(posi, PyBytes_AS_STRING(block), Py_SIZE(block));
298-
posi += Py_SIZE(block);
299-
}
300-
// the last block
301-
block = PyList_GET_ITEM(buffer->list, i);
302-
memcpy(posi, PyBytes_AS_STRING(block), Py_SIZE(block) - avail_out);
303-
} else {
304-
assert(Py_SIZE(result) == 0);
305-
}
306-
307-
Py_CLEAR(buffer->list);
308-
return result;
245+
assert(buffer->writer != NULL);
246+
return PyBytesWriter_FinishWithSize(buffer->writer,
247+
buffer->allocated - avail_out);
309248
}
310249

311250
/* Clean up the buffer when an error occurred. */
312251
static inline void
313252
_BlocksOutputBuffer_OnError(_BlocksOutputBuffer *buffer)
314253
{
315-
Py_CLEAR(buffer->list);
254+
PyBytesWriter_Discard(buffer->writer);
255+
buffer->writer = NULL;
316256
}
317257

318258
#ifdef __cplusplus

Modules/_bz2module.c

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -190,7 +190,7 @@ static PyObject *
190190
compress(BZ2Compressor *c, char *data, size_t len, int action)
191191
{
192192
PyObject *result;
193-
_BlocksOutputBuffer buffer = {.list = NULL};
193+
_BlocksOutputBuffer buffer = {.writer = NULL};
194194

195195
if (OutputBuffer_InitAndGrow(&buffer, -1, &c->bzs.next_out, &c->bzs.avail_out) < 0) {
196196
goto error;
@@ -429,7 +429,7 @@ decompress_buf(BZ2Decompressor *d, Py_ssize_t max_length)
429429
compare against max_length and PyBytes_GET_SIZE we declare it as
430430
signed */
431431
PyObject *result;
432-
_BlocksOutputBuffer buffer = {.list = NULL};
432+
_BlocksOutputBuffer buffer = {.writer = NULL};
433433
bz_stream *bzs = &d->bzs;
434434

435435
if (OutputBuffer_InitAndGrow(&buffer, max_length, &bzs->next_out, &bzs->avail_out) < 0) {

Modules/_lzmamodule.c

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -554,7 +554,7 @@ static PyObject *
554554
compress(Compressor *c, uint8_t *data, size_t len, lzma_action action)
555555
{
556556
PyObject *result;
557-
_BlocksOutputBuffer buffer = {.list = NULL};
557+
_BlocksOutputBuffer buffer = {.writer = NULL};
558558
_lzma_state *state = PyType_GetModuleState(Py_TYPE(c));
559559
assert(state != NULL);
560560

@@ -940,7 +940,7 @@ decompress_buf(Decompressor *d, Py_ssize_t max_length)
940940
{
941941
PyObject *result;
942942
lzma_stream *lzs = &d->lzs;
943-
_BlocksOutputBuffer buffer = {.list = NULL};
943+
_BlocksOutputBuffer buffer = {.writer = NULL};
944944
_lzma_state *state = PyType_GetModuleState(Py_TYPE(d));
945945
assert(state != NULL);
946946

Modules/_zstd/buffer.h

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -16,8 +16,8 @@ static inline int
1616
_OutputBuffer_InitAndGrow(_BlocksOutputBuffer *buffer, ZSTD_outBuffer *ob,
1717
Py_ssize_t max_length)
1818
{
19-
/* Ensure .list was set to NULL */
20-
assert(buffer->list == NULL);
19+
/* Ensure .writer was set to NULL */
20+
assert(buffer->writer == NULL);
2121

2222
Py_ssize_t res = _BlocksOutputBuffer_InitAndGrow(buffer, max_length,
2323
&ob->dst);
@@ -39,8 +39,8 @@ _OutputBuffer_InitWithSize(_BlocksOutputBuffer *buffer, ZSTD_outBuffer *ob,
3939
{
4040
Py_ssize_t block_size;
4141

42-
/* Ensure .list was set to NULL */
43-
assert(buffer->list == NULL);
42+
/* Ensure .writer was set to NULL */
43+
assert(buffer->writer == NULL);
4444

4545
/* Get block size */
4646
if (0 <= max_length && max_length < init_size) {

Modules/_zstd/compressor.c

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -446,7 +446,7 @@ compress_lock_held(ZstdCompressor *self, Py_buffer *data,
446446
assert(PyMutex_IsLocked(&self->lock));
447447
ZSTD_inBuffer in;
448448
ZSTD_outBuffer out;
449-
_BlocksOutputBuffer buffer = {.list = NULL};
449+
_BlocksOutputBuffer buffer = {.writer = NULL};
450450
size_t zstd_ret;
451451
PyObject *ret;
452452

@@ -527,7 +527,7 @@ compress_mt_continue_lock_held(ZstdCompressor *self, Py_buffer *data)
527527
assert(PyMutex_IsLocked(&self->lock));
528528
ZSTD_inBuffer in;
529529
ZSTD_outBuffer out;
530-
_BlocksOutputBuffer buffer = {.list = NULL};
530+
_BlocksOutputBuffer buffer = {.writer = NULL};
531531
size_t zstd_ret;
532532
PyObject *ret;
533533

Modules/_zstd/decompressor.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -216,7 +216,7 @@ decompress_lock_held(ZstdDecompressor *self, ZSTD_inBuffer *in,
216216
{
217217
size_t zstd_ret;
218218
ZSTD_outBuffer out;
219-
_BlocksOutputBuffer buffer = {.list = NULL};
219+
_BlocksOutputBuffer buffer = {.writer = NULL};
220220
PyObject *ret;
221221

222222
/* Initialize the output buffer */

Modules/zlibmodule.c

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -344,7 +344,7 @@ zlib_compress_impl(PyObject *module, Py_buffer *data, int level, int wbits)
344344
PyObject *return_value;
345345
int flush;
346346
z_stream zst;
347-
_BlocksOutputBuffer buffer = {.list = NULL};
347+
_BlocksOutputBuffer buffer = {.writer = NULL};
348348

349349
zlibstate *state = get_zlib_state(module);
350350

@@ -445,7 +445,7 @@ zlib_decompress_impl(PyObject *module, Py_buffer *data, int wbits,
445445
Py_ssize_t ibuflen;
446446
int err, flush;
447447
z_stream zst;
448-
_BlocksOutputBuffer buffer = {.list = NULL};
448+
_BlocksOutputBuffer buffer = {.writer = NULL};
449449
_Uint32Window window; // output buffer's UINT32_MAX sliding window
450450

451451
zlibstate *state = get_zlib_state(module);
@@ -774,7 +774,7 @@ zlib_Compress_compress_impl(compobject *self, PyTypeObject *cls,
774774
{
775775
PyObject *return_value;
776776
int err;
777-
_BlocksOutputBuffer buffer = {.list = NULL};
777+
_BlocksOutputBuffer buffer = {.writer = NULL};
778778
zlibstate *state = PyType_GetModuleState(cls);
779779

780780
ENTER_ZLIB(self);
@@ -898,7 +898,7 @@ zlib_Decompress_decompress_impl(compobject *self, PyTypeObject *cls,
898898
int err = Z_OK;
899899
Py_ssize_t ibuflen;
900900
PyObject *return_value;
901-
_BlocksOutputBuffer buffer = {.list = NULL};
901+
_BlocksOutputBuffer buffer = {.writer = NULL};
902902

903903
PyObject *module = PyType_GetModule(cls);
904904
if (module == NULL)
@@ -1005,7 +1005,7 @@ zlib_Compress_flush_impl(compobject *self, PyTypeObject *cls, int mode)
10051005
{
10061006
int err;
10071007
PyObject *return_value;
1008-
_BlocksOutputBuffer buffer = {.list = NULL};
1008+
_BlocksOutputBuffer buffer = {.writer = NULL};
10091009

10101010
zlibstate *state = PyType_GetModuleState(cls);
10111011
/* Flushing with Z_NO_FLUSH is a no-op, so there's no point in
@@ -1267,7 +1267,7 @@ zlib_Decompress_flush_impl(compobject *self, PyTypeObject *cls,
12671267
Py_buffer data;
12681268
PyObject *return_value;
12691269
Py_ssize_t ibuflen;
1270-
_BlocksOutputBuffer buffer = {.list = NULL};
1270+
_BlocksOutputBuffer buffer = {.writer = NULL};
12711271
_Uint32Window window; // output buffer's UINT32_MAX sliding window
12721272

12731273
PyObject *module = PyType_GetModule(cls);

0 commit comments

Comments
 (0)