
Commit 44d3ee4

Track the size of individual chunks in ggml_dyn_tallocr and raise the maximum number of chunks.
Revert to using suballocation_block_size as the max chunk size for Vulkan.
1 parent: 8d3c5d9

3 files changed: +80 −58 lines

ggml/src/ggml-alloc.c

Lines changed: 52 additions & 50 deletions
```diff
@@ -95,6 +95,8 @@ enum ggml_status ggml_tallocr_alloc(struct ggml_tallocr * talloc, struct ggml_te
 
 // dynamic tensor allocator
 
+#define GGML_VBUFFER_MAX_CHUNKS 16
+
 struct free_block {
     size_t offset;
     size_t size;
@@ -103,8 +105,9 @@ struct free_block {
 struct ggml_dyn_tallocr {
     size_t alignment;
     int n_free_blocks;
+    int n_chunks;
     struct free_block free_blocks[MAX_FREE_BLOCKS];
-    size_t max_size;
+    size_t max_size[GGML_VBUFFER_MAX_CHUNKS];
     size_t max_chunk_size;
 
 #ifdef GGML_ALLOCATOR_DEBUG
@@ -117,10 +120,21 @@ struct ggml_dyn_tallocr {
 
 // the memory range [0, max_size) is divided into n chunks of size max_chunk_size (with the last chunk possibly being smaller).
 // tensor allocations may not cross chunk boundaries.
-static void ggml_dyn_tallocr_new_chunk(struct ggml_dyn_tallocr * alloc, struct free_block * block) {
-    size_t n_chunks = (alloc->max_size + alloc->max_chunk_size - 1) / alloc->max_chunk_size;
-    block->offset = n_chunks * alloc->max_chunk_size;
-    block->size = alloc->max_chunk_size;
+static size_t ggml_dyn_tallocr_chunk_index(struct ggml_dyn_tallocr * alloc, size_t offset) {
+    for (int i = 0; i < alloc->n_chunks; i++) {
+        if (offset < alloc->max_size[i]) {
+            return i;
+        }
+    }
+    return alloc->n_chunks - 1;
+}
+
+static void ggml_dyn_tallocr_new_chunk(struct ggml_dyn_tallocr * alloc, struct free_block * block, size_t min_size) {
+    GGML_ASSERT(alloc->n_chunks >= 1);
+    block->offset = alloc->max_size[alloc->n_chunks - 1];
+    block->size = MAX(min_size, alloc->max_chunk_size);
+    alloc->n_chunks++;
+    GGML_ASSERT(alloc->n_chunks <= GGML_VBUFFER_MAX_CHUNKS);
 }
 
 #ifdef GGML_ALLOCATOR_DEBUG
@@ -149,10 +163,6 @@ static size_t ggml_dyn_tallocr_alloc(struct ggml_dyn_tallocr * alloc, size_t siz
     size = aligned_offset(NULL, size, alloc->alignment);
 
     AT_PRINTF("%s: allocating %s (%zu bytes) - ", __func__, tensor->name, size);
-    if (size > alloc->max_chunk_size) {
-        GGML_ABORT("allocation failed: tensor %s (%zu bytes) exceeds maximum backend buffer size (%zu bytes)\n",
-                tensor->name, size, alloc->max_chunk_size);
-    }
 
     size_t max_avail = 0;
 
@@ -172,14 +182,10 @@ static size_t ggml_dyn_tallocr_alloc(struct ggml_dyn_tallocr * alloc, size_t siz
         // the last block represents memory still available in an existing chunk
         struct free_block * block = &alloc->free_blocks[alloc->n_free_blocks - 1];
         max_avail = MAX(max_avail, block->size);
-        if (block->size >= size) {
-            best_fit_block = alloc->n_free_blocks - 1;
-        } else {
-            // not enough space in existing chunk, create a new one at the end
-            best_fit_block = alloc->n_free_blocks;
-            alloc->n_free_blocks += 1;
-            GGML_ASSERT(alloc->n_free_blocks < MAX_FREE_BLOCKS && "out of free blocks");
-            ggml_dyn_tallocr_new_chunk(alloc, &alloc->free_blocks[alloc->n_free_blocks - 1]);
+        best_fit_block = alloc->n_free_blocks - 1;
+        if (block->size < size) {
+            // not enough space in existing chunk, start the next one
+            ggml_dyn_tallocr_new_chunk(alloc, &alloc->free_blocks[best_fit_block], size);
         }
     }
 
@@ -196,7 +202,7 @@ static size_t ggml_dyn_tallocr_alloc(struct ggml_dyn_tallocr * alloc, size_t siz
         // if there are no remaining blocks all memory in current chunk was used up -> start the next one
         if (alloc->n_free_blocks == 0) {
             alloc->n_free_blocks = 1;
-            ggml_dyn_tallocr_new_chunk(alloc, &alloc->free_blocks[0]);
+            ggml_dyn_tallocr_new_chunk(alloc, &alloc->free_blocks[0], 0);
         }
     }
 
@@ -232,7 +238,7 @@ static size_t ggml_dyn_tallocr_alloc(struct ggml_dyn_tallocr * alloc, size_t siz
     }
 #endif
 
-    alloc->max_size = MAX(alloc->max_size, offset + size);
+    alloc->max_size[alloc->n_chunks-1] = MAX(alloc->max_size[alloc->n_chunks-1], offset + size);
 
     return offset;
 
@@ -248,13 +254,13 @@ static void ggml_dyn_tallocr_free_tensor(struct ggml_dyn_tallocr * alloc, size_t
 #ifdef GGML_ALLOCATOR_DEBUG
     remove_allocated_tensor(alloc, offset, tensor);
 #endif
-    size_t chunk = offset / alloc->max_chunk_size;
+    size_t chunk = ggml_dyn_tallocr_chunk_index(alloc, offset);
 
     // see if we can merge with an existing block
     for (int i = 0; i < alloc->n_free_blocks; i++) {
         struct free_block * block = &alloc->free_blocks[i];
         // can only merge with blocks within the same chunk
-        size_t block_chunk = block->offset / alloc->max_chunk_size;
+        size_t block_chunk = ggml_dyn_tallocr_chunk_index(alloc, block->offset);
         if (chunk != block_chunk) {
             continue;
         }
@@ -264,7 +270,7 @@ static void ggml_dyn_tallocr_free_tensor(struct ggml_dyn_tallocr * alloc, size_t
         // check if we can merge with the next block (within the same chunk)
         if (i < alloc->n_free_blocks - 1) {
             struct free_block * next = &alloc->free_blocks[i+1];
-            if (block->offset + block->size == next->offset && block_chunk == (next->offset / alloc->max_chunk_size)) {
+            if (block->offset + block->size == next->offset && block_chunk == ggml_dyn_tallocr_chunk_index(alloc, next->offset)) {
                 block->size += next->size;
                 alloc->n_free_blocks--;
                 for (int j = i+1; j < alloc->n_free_blocks; j++) {
@@ -281,7 +287,7 @@ static void ggml_dyn_tallocr_free_tensor(struct ggml_dyn_tallocr * alloc, size_t
         // check if we can merge with the previous block (within the same chunk)
         if (i > 0) {
             struct free_block * prev = &alloc->free_blocks[i-1];
-            if (prev->offset + prev->size == block->offset && block_chunk == (prev->offset / alloc->max_chunk_size)) {
+            if (prev->offset + prev->size == block->offset && block_chunk == ggml_dyn_tallocr_chunk_index(alloc, prev->offset)) {
                 prev->size += block->size;
                 alloc->n_free_blocks--;
                 for (int j = i; j < alloc->n_free_blocks; j++) {
@@ -313,9 +319,10 @@ static void ggml_dyn_tallocr_free_tensor(struct ggml_dyn_tallocr * alloc, size_t
 
 static void ggml_dyn_tallocr_reset(struct ggml_dyn_tallocr * alloc) {
     alloc->n_free_blocks = 1;
+    alloc->n_chunks = 1;
     alloc->free_blocks[0].offset = 0;
     alloc->free_blocks[0].size = alloc->max_chunk_size;
-    alloc->max_size = 0;
+    memset(alloc->max_size, 0, sizeof(alloc->max_size));
 
     if (alloc->free_blocks[0].size == SIZE_MAX) {
         alloc->free_blocks[0].size = SIZE_MAX/2; // avoid overflows
@@ -334,8 +341,9 @@ static struct ggml_dyn_tallocr * ggml_dyn_tallocr_new(size_t alignment, size_t m
     *alloc = (struct ggml_dyn_tallocr) {
         /*.alignment      = */ alignment,
         /*.n_free_blocks  = */ 0,
+        /*.n_chunks       = */ 0,
         /*.free_blocks    = */ {{0}},
-        /*.max_size       = */ 0,
+        /*.max_size       = */ {0},
         /*.max_chunk_size = */ max_buffer_size,
 #ifdef GGML_ALLOCATOR_DEBUG
         /*.allocated_tensors = */ {{0}},
@@ -352,14 +360,12 @@ static void ggml_dyn_tallocr_free(struct ggml_dyn_tallocr * alloc) {
 }
 
 static size_t ggml_dyn_tallocr_max_size(struct ggml_dyn_tallocr * alloc) {
-    return alloc->max_size;
+    return alloc->max_size[alloc->n_chunks - 1];
 }
 
 
 // virtual buffer with contiguous memory range, split into multiple backend buffers (chunks)
 
-#define GGML_VBUFFER_MAX_CHUNKS 8
-
 struct vbuffer {
     ggml_backend_buffer_type_t buft;
     ggml_backend_buffer_t chunks[GGML_VBUFFER_MAX_CHUNKS];
@@ -401,36 +407,32 @@ static size_t ggml_vbuffer_size(struct vbuffer * buf) {
     return size;
 }
 
-static int ggml_vbuffer_alloc(struct vbuffer * buf, size_t size, enum ggml_backend_buffer_usage usage) {
-    size_t max_chunk_size = ggml_backend_buft_get_max_size(buf->buft);
-    if (size > GGML_VBUFFER_MAX_CHUNKS * max_chunk_size) {
-        return 0;
-    }
-
-    int n = 0;
-    // always allocate at least 1 chunk even if requested size is 0
-    while (size > 0 || n == 0) {
-        GGML_ASSERT(n < GGML_VBUFFER_MAX_CHUNKS);
-        size_t chunk_size = MIN(size, max_chunk_size);
+static bool ggml_vbuffer_alloc(struct vbuffer * buf, const struct ggml_dyn_tallocr * talloc, enum ggml_backend_buffer_usage usage) {
+    for (int n = 0; n < talloc->n_chunks; n++) {
+        size_t chunk_size = talloc->max_size[n];
+        if (n > 0) {
+            chunk_size -= talloc->max_size[n - 1];
+        }
         buf->chunks[n] = ggml_backend_buft_alloc_buffer(buf->buft, chunk_size);
         if (buf->chunks[n] == NULL) {
             ggml_vbuffer_free_chunks(buf);
-            return 0;
+            return false;
         }
         ggml_backend_buffer_set_usage(buf->chunks[n], usage);
-
-        GGML_ASSERT(size >= chunk_size);
-        size -= chunk_size;
-        n += 1;
     }
-    return n;
+    return true;
 }
 
 static void ggml_vbuffer_tensor_alloc(struct vbuffer * buf, struct ggml_tensor * tensor, size_t offset) {
-    size_t max_chunk_size = ggml_backend_buft_get_max_size(buf->buft);
-    size_t chunk_index = offset / max_chunk_size;
-    size_t chunk_offset = offset % max_chunk_size;
-    GGML_ASSERT(chunk_index < GGML_VBUFFER_MAX_CHUNKS && buf->chunks[chunk_index] != NULL);
+    size_t chunk_index = 0, chunk_offset = offset;
+    while (chunk_index < GGML_VBUFFER_MAX_CHUNKS && buf->chunks[chunk_index]) {
+        size_t chunk_size = ggml_backend_buffer_get_size(buf->chunks[chunk_index]);
+        if (chunk_offset < chunk_size) {
+            break;
+        }
+        chunk_offset -= chunk_size;
+        chunk_index++;
+    }
 
     void * base = ggml_backend_buffer_get_base(buf->chunks[chunk_index]);
     void * addr = (char *)base + chunk_offset;
@@ -880,7 +882,7 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c
 #endif
 
         ggml_vbuffer_free_chunks(galloc->buffers[i]);
-        if (!ggml_vbuffer_alloc(galloc->buffers[i], new_size, GGML_BACKEND_BUFFER_USAGE_COMPUTE)) {
+        if (!ggml_vbuffer_alloc(galloc->buffers[i], galloc->buf_tallocs[i], GGML_BACKEND_BUFFER_USAGE_COMPUTE)) {
            GGML_LOG_ERROR("%s: failed to allocate %s buffer of size %zu\n", __func__, ggml_backend_buft_name(galloc->bufts[i]), new_size);
            return false;
        }
```
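The key change above is that ggml_dyn_tallocr now records a cumulative high-water mark per chunk instead of a single max_size: max_size[i] is the end offset of chunk i, so an offset is mapped to a chunk by scanning the array rather than dividing by a fixed chunk size, and the backend buffer for chunk i is sized as max_size[i] - max_size[i-1]. Below is a minimal standalone sketch of that bookkeeping; the names (toy_tallocr, chunk_index, chunk_size) and the numbers are made up for illustration and are not the actual ggml code.

```c
/*
 * Standalone sketch of per-chunk size tracking with a cumulative max_size array.
 * Compile with any C99 compiler; this only mirrors the idea from the diff.
 */
#include <assert.h>
#include <stddef.h>
#include <stdio.h>

#define MAX_CHUNKS 16

struct toy_tallocr {
    int    n_chunks;
    size_t max_size[MAX_CHUNKS]; // cumulative end offset of each chunk
};

// mirrors ggml_dyn_tallocr_chunk_index: first chunk whose end lies past the offset
static size_t chunk_index(const struct toy_tallocr *a, size_t offset) {
    for (int i = 0; i < a->n_chunks; i++) {
        if (offset < a->max_size[i]) {
            return (size_t) i;
        }
    }
    return (size_t) (a->n_chunks - 1);
}

// mirrors the per-chunk buffer size computation in the new ggml_vbuffer_alloc
static size_t chunk_size(const struct toy_tallocr *a, int i) {
    return i == 0 ? a->max_size[0] : a->max_size[i] - a->max_size[i - 1];
}

int main(void) {
    // two chunks covering [0, 32) and [32, 56), i.e. sizes 32 and 24
    struct toy_tallocr a = { .n_chunks = 2, .max_size = { 32, 56 } };

    assert(chunk_index(&a, 0)  == 0);
    assert(chunk_index(&a, 31) == 0);
    assert(chunk_index(&a, 32) == 1);
    assert(chunk_size(&a, 0)   == 32);
    assert(chunk_size(&a, 1)   == 24);

    printf("chunk sizes: %zu, %zu\n", chunk_size(&a, 0), chunk_size(&a, 1));
    return 0;
}
```

Because ggml_dyn_tallocr_new_chunk can now create chunks larger than max_chunk_size, a fixed-divisor mapping would no longer be correct, which is why both the allocator and ggml_vbuffer_tensor_alloc switch to scanning the recorded sizes.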

ggml/src/ggml-vulkan/ggml-vulkan.cpp

Lines changed: 1 addition & 1 deletion
```diff
@@ -11129,7 +11129,7 @@ static size_t ggml_backend_vk_buffer_type_get_alignment(ggml_backend_buffer_type
 
 static size_t ggml_backend_vk_buffer_type_get_max_size(ggml_backend_buffer_type_t buft) {
     ggml_backend_vk_buffer_type_context * ctx = (ggml_backend_vk_buffer_type_context *) buft->context;
-    return ctx->device->max_memory_allocation_size;
+    return ctx->device->suballocation_block_size;
 }
 
 static size_t ggml_backend_vk_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, const ggml_tensor * tensor) {
```
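With the allocator change above, the size reported by get_max_size acts as a preferred chunk size rather than a hard cap: a tensor that does not fit still gets its own, larger chunk, because ggml_dyn_tallocr_new_chunk sizes a chunk as MAX(min_size, max_chunk_size). That lines up with the commit reverting Vulkan to the smaller suballocation_block_size. Here is a toy illustration of that sizing rule only; the numbers and the helper name are made up and this is not the actual ggml code.

```c
// Toy illustration of the MAX(min_size, max_chunk_size) chunk sizing rule:
// a tensor larger than the reported max gets its own oversized chunk
// instead of aborting the allocation.
#include <assert.h>
#include <stddef.h>

#define MAX(a, b) ((a) > (b) ? (a) : (b))

static size_t next_chunk_size(size_t min_size, size_t max_chunk_size) {
    return MAX(min_size, max_chunk_size);
}

int main(void) {
    size_t preferred = 256;                            // e.g. a suballocation block size
    assert(next_chunk_size(100,  preferred) == 256);   // small tensor: chunk keeps the preferred size
    assert(next_chunk_size(1000, preferred) == 1000);  // oversized tensor: chunk grows to fit it
    return 0;
}
```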

tests/test-alloc.cpp

Lines changed: 27 additions & 7 deletions
```diff
@@ -259,7 +259,7 @@ static void check_no_overlap(ggml_cgraph * graph) {
 //
 // test cases
 
-// scenario where the first backend buffer is completely exhausted and there are further
+// Scenario where the first backend buffer is completely exhausted and there are further
 // tensors which require a second buffer
 static void test_max_size_too_many_tensors() {
     dummy_backend backend = dummy_backend_init(16);
@@ -282,7 +282,7 @@ static void test_max_size_too_many_tensors() {
     GGML_ASSERT(backend.context->allocated_total() <= 16 + 16);
 }
 
-// scenario where there is some space left in the first buffer, but not enough to accomodate
+// Scenario where there is some space left in the first buffer, but not enough to accomodate
 // a larger tensor, so a second buffer is required
 static void test_max_size_tensor_too_large() {
     dummy_backend backend = dummy_backend_init(32);
@@ -301,7 +301,25 @@ static void test_max_size_tensor_too_large() {
     GGML_ASSERT(backend.context->allocated_total() <= 32 + 24);
 }
 
-// check that views don't require any extra memory
+// Scenario where a single tensor exceeds the max buffer size - in this case the allocator
+// should try to create a bigger buffer anyway, and wait for the backend to throw an error.
+// Backends may report an artificially lower max size in some cases for compatibility reasons.
+static void test_tensor_larger_than_max_size() {
+    dummy_backend backend = dummy_backend_init(16);
+    auto [ctx, graph, ctx_ptr] = make_context();
+
+    ggml_tensor * x[2];
+    x[0] = make_input_with_size(ctx, 24);
+    x[1] = ggml_scale(ctx, x[0], 2.0f);
+    assign_names(ctx);
+
+    ggml_gallocr_ptr galloc = allocate_graph(graph, x[1], &backend.buffer_type);
+    check_all_allocated(graph);
+    check_no_overlap(graph);
+    GGML_ASSERT(backend.context->allocated_total() == 24);
+}
+
+// Check that views don't require any extra memory
 static void test_view_inplace() {
     dummy_backend backend = dummy_backend_init(32);
     auto [ctx, graph, ctx_ptr] = make_context();
@@ -323,7 +341,7 @@ static void test_view_inplace() {
 }
 
 static void test_reuse_and_free() {
-    dummy_backend backend = dummy_backend_init(32);
+    dummy_backend backend = dummy_backend_init(40);
     auto [ctx, graph, ctx_ptr] = make_context();
 
     ggml_tensor * x[9];
@@ -342,7 +360,7 @@ static void test_reuse_and_free() {
     check_all_allocated(graph);
     check_no_overlap(graph);
     check_max_size(ctx);
-    GGML_ASSERT(backend.context->allocated_total() <= 32 + 32 + 32);
+    GGML_ASSERT(backend.context->allocated_total() <= 40 + 32 + 32);
 }
 
 static void test_merge_free_block(size_t max_buffer_size) {
@@ -455,8 +473,9 @@ static void test_buffer_size_zero() {
 
     ggml_backend_buffer_type_t bufts[2] = { &backend_a.buffer_type, &backend_b.buffer_type };
     ggml_gallocr_ptr galloc = ggml_gallocr_ptr(ggml_gallocr_new_n(bufts, 2));
-    ggml_gallocr_reserve_n(galloc.get(), graph, node_buffer_ids, leaf_buffer_ids);
-    ggml_gallocr_alloc_graph(galloc.get(), graph);
+    bool res1 = ggml_gallocr_reserve_n(galloc.get(), graph, node_buffer_ids, leaf_buffer_ids);
+    bool res2 = ggml_gallocr_alloc_graph(galloc.get(), graph);
+    GGML_ASSERT(res1 && res2);
 
     check_all_allocated(graph);
     GGML_ASSERT(backend_a.context->allocated_total() == 16);
@@ -473,6 +492,7 @@ static void run(const char * name, void (*f)()) {
 int main() {
     run("test_max_size_too_many_tensors", test_max_size_too_many_tensors);
     run("test_max_size_tensor_too_large", test_max_size_tensor_too_large);
+    run("test_tensor_larger_than_max_size", test_tensor_larger_than_max_size);
     run("test_view_inplace", test_view_inplace);
     run("test_reuse_and_free", test_reuse_and_free);
     run("test_merge_free_block(32)", []() { test_merge_free_block(32); });
```
