
Commit 44d3ee4

Track the size of individual chunks in ggml_dyn_tallocr and raise the maximum number of chunks.
Revert to using suballocation_block_size as the max chunk size for Vulkan.
1 parent: 8d3c5d9

3 files changed: +80 −58 lines

ggml/src/ggml-alloc.c

Lines changed: 52 additions & 50 deletions
```diff
@@ -95,6 +95,8 @@ enum ggml_status ggml_tallocr_alloc(struct ggml_tallocr * talloc, struct ggml_te
 
 // dynamic tensor allocator
 
+#define GGML_VBUFFER_MAX_CHUNKS 16
+
 struct free_block {
     size_t offset;
     size_t size;
@@ -103,8 +105,9 @@ struct free_block {
 struct ggml_dyn_tallocr {
     size_t alignment;
     int n_free_blocks;
+    int n_chunks;
     struct free_block free_blocks[MAX_FREE_BLOCKS];
-    size_t max_size;
+    size_t max_size[GGML_VBUFFER_MAX_CHUNKS];
     size_t max_chunk_size;
 
 #ifdef GGML_ALLOCATOR_DEBUG
@@ -117,10 +120,21 @@ struct ggml_dyn_tallocr {
 
 // the memory range [0, max_size) is divided into n chunks of size max_chunk_size (with the last chunk possibly being smaller).
 // tensor allocations may not cross chunk boundaries.
-static void ggml_dyn_tallocr_new_chunk(struct ggml_dyn_tallocr * alloc, struct free_block * block) {
-    size_t n_chunks = (alloc->max_size + alloc->max_chunk_size - 1) / alloc->max_chunk_size;
-    block->offset = n_chunks * alloc->max_chunk_size;
-    block->size = alloc->max_chunk_size;
+static size_t ggml_dyn_tallocr_chunk_index(struct ggml_dyn_tallocr * alloc, size_t offset) {
+    for (int i = 0; i < alloc->n_chunks; i++) {
+        if (offset < alloc->max_size[i]) {
+            return i;
+        }
+    }
+    return alloc->n_chunks - 1;
+}
+
+static void ggml_dyn_tallocr_new_chunk(struct ggml_dyn_tallocr * alloc, struct free_block * block, size_t min_size) {
+    GGML_ASSERT(alloc->n_chunks >= 1);
+    block->offset = alloc->max_size[alloc->n_chunks - 1];
+    block->size = MAX(min_size, alloc->max_chunk_size);
+    alloc->n_chunks++;
+    GGML_ASSERT(alloc->n_chunks <= GGML_VBUFFER_MAX_CHUNKS);
 }
 
 #ifdef GGML_ALLOCATOR_DEBUG
@@ -149,10 +163,6 @@ static size_t ggml_dyn_tallocr_alloc(struct ggml_dyn_tallocr * alloc, size_t siz
     size = aligned_offset(NULL, size, alloc->alignment);
 
     AT_PRINTF("%s: allocating %s (%zu bytes) - ", __func__, tensor->name, size);
-    if (size > alloc->max_chunk_size) {
-        GGML_ABORT("allocation failed: tensor %s (%zu bytes) exceeds maximum backend buffer size (%zu bytes)\n",
-                tensor->name, size, alloc->max_chunk_size);
-    }
 
     size_t max_avail = 0;
 
@@ -172,14 +182,10 @@ static size_t ggml_dyn_tallocr_alloc(struct ggml_dyn_tallocr * alloc, size_t siz
         // the last block represents memory still available in an existing chunk
         struct free_block * block = &alloc->free_blocks[alloc->n_free_blocks - 1];
         max_avail = MAX(max_avail, block->size);
-        if (block->size >= size) {
-            best_fit_block = alloc->n_free_blocks - 1;
-        } else {
-            // not enough space in existing chunk, create a new one at the end
-            best_fit_block = alloc->n_free_blocks;
-            alloc->n_free_blocks += 1;
-            GGML_ASSERT(alloc->n_free_blocks < MAX_FREE_BLOCKS && "out of free blocks");
-            ggml_dyn_tallocr_new_chunk(alloc, &alloc->free_blocks[alloc->n_free_blocks - 1]);
+        best_fit_block = alloc->n_free_blocks - 1;
+        if (block->size < size) {
+            // not enough space in existing chunk, start the next one
+            ggml_dyn_tallocr_new_chunk(alloc, &alloc->free_blocks[best_fit_block], size);
         }
     }
 
@@ -196,7 +202,7 @@ static size_t ggml_dyn_tallocr_alloc(struct ggml_dyn_tallocr * alloc, size_t siz
         // if there are no remaining blocks all memory in current chunk was used up -> start the next one
         if (alloc->n_free_blocks == 0) {
             alloc->n_free_blocks = 1;
-            ggml_dyn_tallocr_new_chunk(alloc, &alloc->free_blocks[0]);
+            ggml_dyn_tallocr_new_chunk(alloc, &alloc->free_blocks[0], 0);
         }
     }
 
@@ -232,7 +238,7 @@ static size_t ggml_dyn_tallocr_alloc(struct ggml_dyn_tallocr * alloc, size_t siz
     }
 #endif
 
-    alloc->max_size = MAX(alloc->max_size, offset + size);
+    alloc->max_size[alloc->n_chunks-1] = MAX(alloc->max_size[alloc->n_chunks-1], offset + size);
 
     return offset;
 
@@ -248,13 +254,13 @@ static void ggml_dyn_tallocr_free_tensor(struct ggml_dyn_tallocr * alloc, size_t
 #ifdef GGML_ALLOCATOR_DEBUG
     remove_allocated_tensor(alloc, offset, tensor);
 #endif
-    size_t chunk = offset / alloc->max_chunk_size;
+    size_t chunk = ggml_dyn_tallocr_chunk_index(alloc, offset);
 
     // see if we can merge with an existing block
     for (int i = 0; i < alloc->n_free_blocks; i++) {
         struct free_block * block = &alloc->free_blocks[i];
         // can only merge with blocks within the same chunk
-        size_t block_chunk = block->offset / alloc->max_chunk_size;
+        size_t block_chunk = ggml_dyn_tallocr_chunk_index(alloc, block->offset);
         if (chunk != block_chunk) {
             continue;
         }
@@ -264,7 +270,7 @@ static void ggml_dyn_tallocr_free_tensor(struct ggml_dyn_tallocr * alloc, size_t
         // check if we can merge with the next block (within the same chunk)
         if (i < alloc->n_free_blocks - 1) {
             struct free_block * next = &alloc->free_blocks[i+1];
-            if (block->offset + block->size == next->offset && block_chunk == (next->offset / alloc->max_chunk_size)) {
+            if (block->offset + block->size == next->offset && block_chunk == ggml_dyn_tallocr_chunk_index(alloc, next->offset)) {
                 block->size += next->size;
                 alloc->n_free_blocks--;
                 for (int j = i+1; j < alloc->n_free_blocks; j++) {
@@ -281,7 +287,7 @@ static void ggml_dyn_tallocr_free_tensor(struct ggml_dyn_tallocr * alloc, size_t
         // check if we can merge with the previous block (within the same chunk)
         if (i > 0) {
             struct free_block * prev = &alloc->free_blocks[i-1];
-            if (prev->offset + prev->size == block->offset && block_chunk == (prev->offset / alloc->max_chunk_size)) {
+            if (prev->offset + prev->size == block->offset && block_chunk == ggml_dyn_tallocr_chunk_index(alloc, prev->offset)) {
                 prev->size += block->size;
                 alloc->n_free_blocks--;
                 for (int j = i; j < alloc->n_free_blocks; j++) {
@@ -313,9 +319,10 @@ static void ggml_dyn_tallocr_free_tensor(struct ggml_dyn_tallocr * alloc, size_t
 
 static void ggml_dyn_tallocr_reset(struct ggml_dyn_tallocr * alloc) {
     alloc->n_free_blocks = 1;
+    alloc->n_chunks = 1;
     alloc->free_blocks[0].offset = 0;
     alloc->free_blocks[0].size = alloc->max_chunk_size;
-    alloc->max_size = 0;
+    memset(alloc->max_size, 0, sizeof(alloc->max_size));
 
     if (alloc->free_blocks[0].size == SIZE_MAX) {
         alloc->free_blocks[0].size = SIZE_MAX/2; // avoid overflows
@@ -334,8 +341,9 @@ static struct ggml_dyn_tallocr * ggml_dyn_tallocr_new(size_t alignment, size_t m
     *alloc = (struct ggml_dyn_tallocr) {
         /*.alignment      = */ alignment,
         /*.n_free_blocks  = */ 0,
+        /*.n_chunks       = */ 0,
         /*.free_blocks    = */ {{0}},
-        /*.max_size       = */ 0,
+        /*.max_size       = */ {0},
         /*.max_chunk_size = */ max_buffer_size,
 #ifdef GGML_ALLOCATOR_DEBUG
         /*.allocated_tensors = */ {{0}},
@@ -352,14 +360,12 @@ static void ggml_dyn_tallocr_free(struct ggml_dyn_tallocr * alloc) {
 }
 
 static size_t ggml_dyn_tallocr_max_size(struct ggml_dyn_tallocr * alloc) {
-    return alloc->max_size;
+    return alloc->max_size[alloc->n_chunks - 1];
 }
 
 
 // virtual buffer with contiguous memory range, split into multiple backend buffers (chunks)
 
-#define GGML_VBUFFER_MAX_CHUNKS 8
-
 struct vbuffer {
     ggml_backend_buffer_type_t buft;
     ggml_backend_buffer_t chunks[GGML_VBUFFER_MAX_CHUNKS];
@@ -401,36 +407,32 @@ static size_t ggml_vbuffer_size(struct vbuffer * buf) {
     return size;
 }
 
-static int ggml_vbuffer_alloc(struct vbuffer * buf, size_t size, enum ggml_backend_buffer_usage usage) {
-    size_t max_chunk_size = ggml_backend_buft_get_max_size(buf->buft);
-    if (size > GGML_VBUFFER_MAX_CHUNKS * max_chunk_size) {
-        return 0;
-    }
-
-    int n = 0;
-    // always allocate at least 1 chunk even if requested size is 0
-    while (size > 0 || n == 0) {
-        GGML_ASSERT(n < GGML_VBUFFER_MAX_CHUNKS);
-        size_t chunk_size = MIN(size, max_chunk_size);
+static bool ggml_vbuffer_alloc(struct vbuffer * buf, const struct ggml_dyn_tallocr * talloc, enum ggml_backend_buffer_usage usage) {
+    for (int n = 0; n < talloc->n_chunks; n++) {
+        size_t chunk_size = talloc->max_size[n];
+        if (n > 0) {
+            chunk_size -= talloc->max_size[n - 1];
+        }
         buf->chunks[n] = ggml_backend_buft_alloc_buffer(buf->buft, chunk_size);
         if (buf->chunks[n] == NULL) {
             ggml_vbuffer_free_chunks(buf);
-            return 0;
+            return false;
         }
         ggml_backend_buffer_set_usage(buf->chunks[n], usage);
-
-        GGML_ASSERT(size >= chunk_size);
-        size -= chunk_size;
-        n += 1;
     }
-    return n;
+    return true;
 }
 
 static void ggml_vbuffer_tensor_alloc(struct vbuffer * buf, struct ggml_tensor * tensor, size_t offset) {
-    size_t max_chunk_size = ggml_backend_buft_get_max_size(buf->buft);
-    size_t chunk_index = offset / max_chunk_size;
-    size_t chunk_offset = offset % max_chunk_size;
-    GGML_ASSERT(chunk_index < GGML_VBUFFER_MAX_CHUNKS && buf->chunks[chunk_index] != NULL);
+    size_t chunk_index = 0, chunk_offset = offset;
+    while (chunk_index < GGML_VBUFFER_MAX_CHUNKS && buf->chunks[chunk_index]) {
+        size_t chunk_size = ggml_backend_buffer_get_size(buf->chunks[chunk_index]);
+        if (chunk_offset < chunk_size) {
+            break;
+        }
+        chunk_offset -= chunk_size;
+        chunk_index++;
+    }
 
     void * base = ggml_backend_buffer_get_base(buf->chunks[chunk_index]);
     void * addr = (char *)base + chunk_offset;
@@ -880,7 +882,7 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c
 #endif
 
         ggml_vbuffer_free_chunks(galloc->buffers[i]);
-        if (!ggml_vbuffer_alloc(galloc->buffers[i], new_size, GGML_BACKEND_BUFFER_USAGE_COMPUTE)) {
+        if (!ggml_vbuffer_alloc(galloc->buffers[i], galloc->buf_tallocs[i], GGML_BACKEND_BUFFER_USAGE_COMPUTE)) {
            GGML_LOG_ERROR("%s: failed to allocate %s buffer of size %zu\n", __func__, ggml_backend_buft_name(galloc->bufts[i]), new_size);
            return false;
        }
```
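The key change above is that ggml_dyn_tallocr now records a cumulative high-water mark per chunk instead of a single max_size: max_size[i] is the end offset of chunk i, so an offset is mapped to a chunk by scanning the array rather than dividing by a fixed chunk size, and the backend buffer for chunk i is sized as max_size[i] - max_size[i-1]. Below is a minimal standalone sketch of that bookkeeping; the names (toy_tallocr, chunk_index, chunk_size) and the numbers are made up for illustration and are not the actual ggml code.

```c
/*
 * Standalone sketch of per-chunk size tracking with a cumulative max_size array.
 * Compile with any C99 compiler; this only mirrors the idea from the diff.
 */
#include <assert.h>
#include <stddef.h>
#include <stdio.h>

#define MAX_CHUNKS 16

struct toy_tallocr {
    int    n_chunks;
    size_t max_size[MAX_CHUNKS]; // cumulative end offset of each chunk
};

// mirrors ggml_dyn_tallocr_chunk_index: first chunk whose end lies past the offset
static size_t chunk_index(const struct toy_tallocr *a, size_t offset) {
    for (int i = 0; i < a->n_chunks; i++) {
        if (offset < a->max_size[i]) {
            return (size_t) i;
        }
    }
    return (size_t) (a->n_chunks - 1);
}

// mirrors the per-chunk buffer size computation in the new ggml_vbuffer_alloc
static size_t chunk_size(const struct toy_tallocr *a, int i) {
    return i == 0 ? a->max_size[0] : a->max_size[i] - a->max_size[i - 1];
}

int main(void) {
    // two chunks covering [0, 32) and [32, 56), i.e. sizes 32 and 24
    struct toy_tallocr a = { .n_chunks = 2, .max_size = { 32, 56 } };

    assert(chunk_index(&a, 0)  == 0);
    assert(chunk_index(&a, 31) == 0);
    assert(chunk_index(&a, 32) == 1);
    assert(chunk_size(&a, 0)   == 32);
    assert(chunk_size(&a, 1)   == 24);

    printf("chunk sizes: %zu, %zu\n", chunk_size(&a, 0), chunk_size(&a, 1));
    return 0;
}
```

Because ggml_dyn_tallocr_new_chunk can now create chunks larger than max_chunk_size, a fixed-divisor mapping would no longer be correct, which is why both the allocator and ggml_vbuffer_tensor_alloc switch to scanning the recorded sizes.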

ggml/src/ggml-vulkan/ggml-vulkan.cpp

Lines changed: 1 addition & 1 deletion
```diff
@@ -11129,7 +11129,7 @@ static size_t ggml_backend_vk_buffer_type_get_alignment(ggml_backend_buffer_type
 
 static size_t ggml_backend_vk_buffer_type_get_max_size(ggml_backend_buffer_type_t buft) {
     ggml_backend_vk_buffer_type_context * ctx = (ggml_backend_vk_buffer_type_context *) buft->context;
-    return ctx->device->max_memory_allocation_size;
+    return ctx->device->suballocation_block_size;
 }
 
 static size_t ggml_backend_vk_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, const ggml_tensor * tensor) {
```
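With the allocator change above, the size reported by get_max_size acts as a preferred chunk size rather than a hard cap: a tensor that does not fit still gets its own, larger chunk, because ggml_dyn_tallocr_new_chunk sizes a chunk as MAX(min_size, max_chunk_size). That lines up with the commit reverting Vulkan to the smaller suballocation_block_size. Here is a toy illustration of that sizing rule only; the numbers and the helper name are made up and this is not the actual ggml code.

```c
// Toy illustration of the MAX(min_size, max_chunk_size) chunk sizing rule:
// a tensor larger than the reported max gets its own oversized chunk
// instead of aborting the allocation.
#include <assert.h>
#include <stddef.h>

#define MAX(a, b) ((a) > (b) ? (a) : (b))

static size_t next_chunk_size(size_t min_size, size_t max_chunk_size) {
    return MAX(min_size, max_chunk_size);
}

int main(void) {
    size_t preferred = 256;                            // e.g. a suballocation block size
    assert(next_chunk_size(100,  preferred) == 256);   // small tensor: chunk keeps the preferred size
    assert(next_chunk_size(1000, preferred) == 1000);  // oversized tensor: chunk grows to fit it
    return 0;
}
```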

tests/test-alloc.cpp

Lines changed: 27 additions & 7 deletions
```diff
@@ -259,7 +259,7 @@ static void check_no_overlap(ggml_cgraph * graph) {
 //
 // test cases
 
-// scenario where the first backend buffer is completely exhausted and there are further
+// Scenario where the first backend buffer is completely exhausted and there are further
 // tensors which require a second buffer
 static void test_max_size_too_many_tensors() {
     dummy_backend backend = dummy_backend_init(16);
@@ -282,7 +282,7 @@ static void test_max_size_too_many_tensors() {
     GGML_ASSERT(backend.context->allocated_total() <= 16 + 16);
 }
 
-// scenario where there is some space left in the first buffer, but not enough to accomodate
+// Scenario where there is some space left in the first buffer, but not enough to accomodate
 // a larger tensor, so a second buffer is required
 static void test_max_size_tensor_too_large() {
     dummy_backend backend = dummy_backend_init(32);
@@ -301,7 +301,25 @@ static void test_max_size_tensor_too_large() {
     GGML_ASSERT(backend.context->allocated_total() <= 32 + 24);
 }
 
-// check that views don't require any extra memory
+// Scenario where a single tensor exceeds the max buffer size - in this case the allocator
+// should try to create a bigger buffer anyway, and wait for the backend to throw an error.
+// Backends may report an artificially lower max size in some cases for compatibility reasons.
+static void test_tensor_larger_than_max_size() {
+    dummy_backend backend = dummy_backend_init(16);
+    auto [ctx, graph, ctx_ptr] = make_context();
+
+    ggml_tensor * x[2];
+    x[0] = make_input_with_size(ctx, 24);
+    x[1] = ggml_scale(ctx, x[0], 2.0f);
+    assign_names(ctx);
+
+    ggml_gallocr_ptr galloc = allocate_graph(graph, x[1], &backend.buffer_type);
+    check_all_allocated(graph);
+    check_no_overlap(graph);
+    GGML_ASSERT(backend.context->allocated_total() == 24);
+}
+
+// Check that views don't require any extra memory
 static void test_view_inplace() {
     dummy_backend backend = dummy_backend_init(32);
     auto [ctx, graph, ctx_ptr] = make_context();
@@ -323,7 +341,7 @@ static void test_view_inplace() {
 }
 
 static void test_reuse_and_free() {
-    dummy_backend backend = dummy_backend_init(32);
+    dummy_backend backend = dummy_backend_init(40);
     auto [ctx, graph, ctx_ptr] = make_context();
 
     ggml_tensor * x[9];
@@ -342,7 +360,7 @@ static void test_reuse_and_free() {
     check_all_allocated(graph);
     check_no_overlap(graph);
     check_max_size(ctx);
-    GGML_ASSERT(backend.context->allocated_total() <= 32 + 32 + 32);
+    GGML_ASSERT(backend.context->allocated_total() <= 40 + 32 + 32);
 }
 
 static void test_merge_free_block(size_t max_buffer_size) {
@@ -455,8 +473,9 @@ static void test_buffer_size_zero() {
 
     ggml_backend_buffer_type_t bufts[2] = { &backend_a.buffer_type, &backend_b.buffer_type };
     ggml_gallocr_ptr galloc = ggml_gallocr_ptr(ggml_gallocr_new_n(bufts, 2));
-    ggml_gallocr_reserve_n(galloc.get(), graph, node_buffer_ids, leaf_buffer_ids);
-    ggml_gallocr_alloc_graph(galloc.get(), graph);
+    bool res1 = ggml_gallocr_reserve_n(galloc.get(), graph, node_buffer_ids, leaf_buffer_ids);
+    bool res2 = ggml_gallocr_alloc_graph(galloc.get(), graph);
+    GGML_ASSERT(res1 && res2);
 
     check_all_allocated(graph);
     GGML_ASSERT(backend_a.context->allocated_total() == 16);
@@ -473,6 +492,7 @@ static void run(const char * name, void (*f)()) {
 int main() {
     run("test_max_size_too_many_tensors", test_max_size_too_many_tensors);
     run("test_max_size_tensor_too_large", test_max_size_tensor_too_large);
+    run("test_tensor_larger_than_max_size", test_tensor_larger_than_max_size);
     run("test_view_inplace", test_view_inplace);
     run("test_reuse_and_free", test_reuse_and_free);
     run("test_merge_free_block(32)", []() { test_merge_free_block(32); });
```
