From 34b473ce317a02721b0d5aa13c2ec3a60c5c3cb4 Mon Sep 17 00:00:00 2001 From: Xiangyan Sun Date: Sun, 12 Oct 2025 17:28:52 -0700 Subject: [PATCH 1/6] Implement and use cuda graph plans. --- ggml/include/ggml-backend.h | 1 + ggml/src/ggml-backend.cpp | 112 +++++++++- ggml/src/ggml-cuda/common.cuh | 10 +- ggml/src/ggml-cuda/cpy.cu | 16 +- ggml/src/ggml-cuda/cpy.cuh | 4 +- ggml/src/ggml-cuda/ggml-cuda.cu | 373 +++++++++++++++++--------------- ggml/src/ggml-cuda/mean.cu | 12 +- ggml/src/ggml-cuda/mean.cuh | 2 +- 8 files changed, 333 insertions(+), 197 deletions(-) diff --git a/ggml/include/ggml-backend.h b/ggml/include/ggml-backend.h index f1b740785914e..c350ae205f15d 100644 --- a/ggml/include/ggml-backend.h +++ b/ggml/include/ggml-backend.h @@ -95,6 +95,7 @@ extern "C" { GGML_API ggml_backend_graph_plan_t ggml_backend_graph_plan_create(ggml_backend_t backend, struct ggml_cgraph * cgraph); GGML_API void ggml_backend_graph_plan_free (ggml_backend_t backend, ggml_backend_graph_plan_t plan); + GGML_API void ggml_backend_graph_plan_update(ggml_backend_t backend, ggml_backend_graph_plan_t plan, const struct ggml_cgraph * cgraph); GGML_API enum ggml_status ggml_backend_graph_plan_compute (ggml_backend_t backend, ggml_backend_graph_plan_t plan); GGML_API enum ggml_status ggml_backend_graph_compute (ggml_backend_t backend, struct ggml_cgraph * cgraph); diff --git a/ggml/src/ggml-backend.cpp b/ggml/src/ggml-backend.cpp index ff9135fe2d878..dd4007d89a3e7 100644 --- a/ggml/src/ggml-backend.cpp +++ b/ggml/src/ggml-backend.cpp @@ -327,6 +327,18 @@ void ggml_backend_synchronize(ggml_backend_t backend) { backend->iface.synchronize(backend); } +bool ggml_backend_supports_graph_plan(ggml_backend_t backend) { + GGML_ASSERT(backend); + + return (bool) backend->iface.graph_plan_create; +} + +bool ggml_backend_supports_graph_plan_update(ggml_backend_t backend) { + GGML_ASSERT(backend); + + return (bool) backend->iface.graph_plan_update; +} + ggml_backend_graph_plan_t ggml_backend_graph_plan_create(ggml_backend_t backend, struct ggml_cgraph * cgraph) { GGML_ASSERT(backend); GGML_ASSERT(backend->iface.graph_plan_create != NULL); @@ -341,6 +353,13 @@ void ggml_backend_graph_plan_free(ggml_backend_t backend, ggml_backend_graph_pla backend->iface.graph_plan_free(backend, plan); } +void ggml_backend_graph_plan_update(ggml_backend_t backend, ggml_backend_graph_plan_t plan, const struct ggml_cgraph* cgraph) { + GGML_ASSERT(backend); + GGML_ASSERT(backend->iface.graph_plan_update != NULL); + + backend->iface.graph_plan_update(backend, plan, cgraph); +} + enum ggml_status ggml_backend_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan) { GGML_ASSERT(backend); GGML_ASSERT(backend->iface.graph_plan_compute != NULL); @@ -675,6 +694,11 @@ struct ggml_backend_sched_split { struct ggml_cgraph graph; }; +struct ggml_backend_sched_plan { + int backend_id; + ggml_backend_graph_plan_t plan; +}; + struct ggml_backend_sched { bool is_reset; // true if the scheduler has been reset since the last graph split bool is_alloc; @@ -704,6 +728,12 @@ struct ggml_backend_sched { int n_splits; int splits_capacity; + // graph plans + struct ggml_backend_sched_plan * plans; + int n_plans; + int plans_capacity; + bool plan_dirty; + // pipeline parallelism support int n_copies; int cur_copy; @@ -908,6 +938,16 @@ static void ggml_backend_sched_set_if_supported(ggml_backend_sched_t sched, stru } } +static void ggml_backend_sched_free_plans(ggml_backend_sched_t sched) { + for (int i = 0; i < sched->n_plans; i++) { + ggml_backend_t 
backend = sched->backends[sched->plans[i].backend_id]; + if (ggml_backend_supports_graph_plan(backend)) { + ggml_backend_graph_plan_free(backend, sched->plans[i].plan); + } + } + sched->n_plans = 0; +} + // assigns backends to ops and splits the graph into subgraphs that can be computed on the same backend void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * graph) { // reset splits @@ -1372,6 +1412,7 @@ void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgra assert(graph_copy->size > graph_copy->n_leafs); graph_copy->leafs[graph_copy->n_leafs++] = leaf; } + sched->plan_dirty = true; } static bool ggml_backend_sched_alloc_splits(ggml_backend_sched_t sched) { @@ -1413,6 +1454,62 @@ static bool ggml_backend_sched_alloc_splits(ggml_backend_sched_t sched) { return true; } +static void ggml_backend_sched_update_plans(ggml_backend_sched_t sched) { + // create graph plans + if (sched->plan_dirty) { + bool create_new_plans; + if (sched->n_plans == sched->n_splits) { + create_new_plans = false; + for (int i = 0; i < sched->n_splits; i++) { + if (sched->splits[i].backend_id != sched->plans[i].backend_id) { + create_new_plans = true; + break; + } + } + } else { + create_new_plans = true; + } + if (create_new_plans) { + // free previous and recreate new plans + ggml_backend_sched_free_plans(sched); + if (sched->plans_capacity < sched->n_splits) { + while (sched->plans_capacity < sched->n_splits) { + sched->plans_capacity *= 2; + } + sched->plans = (ggml_backend_sched_plan *) realloc( + sched->plans, sched->plans_capacity * sizeof(struct ggml_backend_sched_plan)); + GGML_ASSERT(sched->plans); + } + sched->n_plans = sched->n_splits; + for (int i = 0; i < sched->n_splits; i++) { + ggml_backend_t backend = sched->backends[sched->splits[i].backend_id]; + sched->plans[i].backend_id = sched->splits[i].backend_id; + if (ggml_backend_supports_graph_plan(backend)) { + sched->plans[i].plan = ggml_backend_graph_plan_create(backend, &sched->splits[i].graph); + } else { + sched->plans[i].plan = nullptr; + } + } + } else { + // update existing plans + for (int i = 0; i < sched->n_splits; i++) { + ggml_backend_t backend = sched->backends[sched->splits[i].backend_id]; + if (ggml_backend_supports_graph_plan(backend)) { + if (ggml_backend_supports_graph_plan_update(backend)) { + ggml_backend_graph_plan_update(backend, sched->plans[i].plan, &sched->splits[i].graph); + } else { + ggml_backend_graph_plan_free(backend, sched->plans[i].plan); + sched->plans[i].plan = ggml_backend_graph_plan_create(backend, &sched->splits[i].graph); + } + } else { + sched->plans[i].plan = nullptr; + } + } + } + sched->plan_dirty = false; + } +} + static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t sched) { GGML_ASSERT(sched); struct ggml_backend_sched_split * splits = sched->splits; @@ -1421,6 +1518,8 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t s std::vector ids; std::vector used_ids; + ggml_backend_sched_update_plans(sched); + for (int split_id = 0; split_id < sched->n_splits; split_id++) { struct ggml_backend_sched_split * split = &splits[split_id]; int split_backend_id = split->backend_id; @@ -1550,7 +1649,12 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t s } if (!sched->callback_eval) { - enum ggml_status ec = ggml_backend_graph_compute_async(split_backend, &split->graph); + enum ggml_status ec; + if (ggml_backend_supports_graph_plan(split_backend) && sched->plans[split_id].plan) { + 
ec = ggml_backend_graph_plan_compute(split_backend, sched->plans[split_id].plan); + } else { + ec = ggml_backend_graph_compute_async(split_backend, &split->graph); + } if (ec != GGML_STATUS_SUCCESS) { return ec; } @@ -1637,6 +1741,10 @@ ggml_backend_sched_t ggml_backend_sched_new( sched->splits = (ggml_backend_sched_split *) calloc(initial_splits_capacity, sizeof(sched->splits[0])); sched->splits_capacity = initial_splits_capacity; + const int initial_plans_capacity = 16; + sched->plans = (ggml_backend_sched_plan *) calloc(initial_plans_capacity, sizeof(sched->plans[0])); + sched->plans_capacity = initial_plans_capacity; + for (int b = 0; b < n_backends; b++) { sched->backends[b] = backends[b]; sched->bufts[b] = bufts ? bufts[b] : ggml_backend_get_default_buffer_type(backends[b]); @@ -1670,6 +1778,8 @@ void ggml_backend_sched_free(ggml_backend_sched_t sched) { ggml_free(sched->ctx); ggml_hash_set_free(&sched->hash_set); free(sched->splits); + ggml_backend_sched_free_plans(sched); + free(sched->plans); free(sched->hv_tensor_backend_ids); free(sched->hv_tensor_copies); free(sched->node_backend_ids); diff --git a/ggml/src/ggml-cuda/common.cuh b/ggml/src/ggml-cuda/common.cuh index e0abde5427c83..f6ec263febd04 100644 --- a/ggml/src/ggml-cuda/common.cuh +++ b/ggml/src/ggml-cuda/common.cuh @@ -936,13 +936,10 @@ struct ggml_cuda_graph { } cudaGraph_t graph = nullptr; cudaGraphExec_t instance = nullptr; + const ggml_cgraph * cgraph; size_t num_nodes = 0; std::vector nodes; std::vector params; - bool disable_due_to_gpu_arch = false; - bool disable_due_to_too_many_updates = false; - bool disable_due_to_failed_graph_capture = false; - int number_consecutive_updates = 0; std::vector ggml_graph_properties; bool use_cpy_indirection = false; std::vector cpy_dest_ptrs; @@ -962,7 +959,10 @@ struct ggml_backend_cuda_context { cudaStream_t streams[GGML_CUDA_MAX_DEVICES][GGML_CUDA_MAX_STREAMS] = { { nullptr } }; cublasHandle_t cublas_handles[GGML_CUDA_MAX_DEVICES] = {nullptr}; - std::unique_ptr cuda_graph; +#ifdef USE_CUDA_GRAPH + bool cuda_graph_initialized = false; + bool disable_due_to_gpu_arch = false; +#endif explicit ggml_backend_cuda_context(int device) : device(device), diff --git a/ggml/src/ggml-cuda/cpy.cu b/ggml/src/ggml-cuda/cpy.cu index 746f43966b84c..afea5ff8c520b 100644 --- a/ggml/src/ggml-cuda/cpy.cu +++ b/ggml/src/ggml-cuda/cpy.cu @@ -277,7 +277,7 @@ static void ggml_cpy_f32_iq4_nl_cuda( (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, cdst_indirect, graph_cpynode_index++); } -void ggml_cuda_cpy(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, ggml_tensor * src1, bool disable_indirection_for_this_node) { +void ggml_cuda_cpy(ggml_backend_cuda_context & ctx, ggml_cuda_graph * cuda_graph, const ggml_tensor * src0, ggml_tensor * src1, bool disable_indirection_for_this_node) { const int64_t ne = ggml_nelements(src0); GGML_ASSERT(ne == ggml_nelements(src1)); @@ -314,9 +314,9 @@ void ggml_cuda_cpy(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, gg char ** dest_ptrs_d = nullptr; int graph_cpynode_index = -1; #if defined(GGML_CUDA_USE_GRAPHS) || defined(GGML_HIP_GRAPHS) || defined(GGML_MUSA_GRAPHS) - if(ctx.cuda_graph->use_cpy_indirection && !disable_indirection_for_this_node) { - dest_ptrs_d = ctx.cuda_graph->dest_ptrs_d; - graph_cpynode_index = ctx.cuda_graph->graph_cpynode_index; + if (cuda_graph && cuda_graph->use_cpy_indirection && !disable_indirection_for_this_node) { + dest_ptrs_d = cuda_graph->dest_ptrs_d; + 
graph_cpynode_index = cuda_graph->graph_cpynode_index; } #else GGML_UNUSED(disable_indirection_for_this_node); @@ -387,8 +387,8 @@ void ggml_cuda_cpy(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, gg ggml_type_name(src0->type), ggml_type_name(src1->type)); } #if defined(GGML_CUDA_USE_GRAPHS) || defined(GGML_HIP_GRAPHS) || defined(GGML_MUSA_GRAPHS) - if(ctx.cuda_graph->use_cpy_indirection && !disable_indirection_for_this_node) { - ctx.cuda_graph->graph_cpynode_index = graph_cpynode_index; + if (cuda_graph && cuda_graph->use_cpy_indirection && !disable_indirection_for_this_node) { + cuda_graph->graph_cpynode_index = graph_cpynode_index; } #else GGML_UNUSED(disable_indirection_for_this_node); @@ -396,10 +396,10 @@ void ggml_cuda_cpy(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, gg } -void ggml_cuda_dup(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { +void ggml_cuda_dup(ggml_backend_cuda_context & ctx, ggml_cuda_graph * cuda_graph, ggml_tensor * dst) { const ggml_tensor * src0 = dst->src[0]; bool disable_indirection = true; - ggml_cuda_cpy(ctx, src0, dst, disable_indirection); + ggml_cuda_cpy(ctx, cuda_graph, src0, dst, disable_indirection); } void* ggml_cuda_cpy_fn(const ggml_tensor * src0, ggml_tensor * src1) { diff --git a/ggml/src/ggml-cuda/cpy.cuh b/ggml/src/ggml-cuda/cpy.cuh index 0bd3c0c6f8c27..8b2774665f772 100644 --- a/ggml/src/ggml-cuda/cpy.cuh +++ b/ggml/src/ggml-cuda/cpy.cuh @@ -2,9 +2,9 @@ #define CUDA_CPY_BLOCK_SIZE 64 -void ggml_cuda_cpy(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, ggml_tensor * src1, bool disable_indirection = false); +void ggml_cuda_cpy(ggml_backend_cuda_context & ctx, ggml_cuda_graph * cuda_graph, const ggml_tensor * src0, ggml_tensor * src1, bool disable_indirection = false); -void ggml_cuda_dup(ggml_backend_cuda_context & ctx, ggml_tensor * dst); +void ggml_cuda_dup(ggml_backend_cuda_context & ctx, ggml_cuda_graph * cuda_graph, ggml_tensor * dst); void* ggml_cuda_cpy_fn(const ggml_tensor * src0, ggml_tensor * src1); diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu index 856e9de2e1115..099763f5ea053 100644 --- a/ggml/src/ggml-cuda/ggml-cuda.cu +++ b/ggml/src/ggml-cuda/ggml-cuda.cu @@ -2231,7 +2231,7 @@ static void ggml_cuda_mul_mat_id(ggml_backend_cuda_context & ctx, ggml_tensor * nb1, nb2, nb3, stream); } -static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct ggml_tensor * dst) { +static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, ggml_cuda_graph * cuda_graph, struct ggml_tensor * dst) { // why is this here instead of mul_mat? 
if (dst->src[0] != nullptr && ggml_backend_buft_is_cuda_split(dst->src[0]->buffer->buft)) { ggml_cuda_set_peer_access(dst->src[1]->ne[1], ctx.device); @@ -2260,13 +2260,13 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg ggml_cuda_op_set_rows(ctx, dst); break; case GGML_OP_DUP: - ggml_cuda_dup(ctx, dst); + ggml_cuda_dup(ctx, cuda_graph, dst); break; case GGML_OP_CPY: - ggml_cuda_cpy(ctx, dst->src[0], dst->src[1]); + ggml_cuda_cpy(ctx, cuda_graph, dst->src[0], dst->src[1]); break; case GGML_OP_CONT: - ggml_cuda_dup(ctx, dst); + ggml_cuda_dup(ctx, cuda_graph, dst); break; case GGML_OP_ADD: case GGML_OP_ADD1: // TODO: more efficient implementation @@ -2486,7 +2486,7 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg ggml_cuda_op_sum_rows(ctx, dst); break; case GGML_OP_MEAN: - ggml_cuda_op_mean(ctx, dst); + ggml_cuda_op_mean(ctx, cuda_graph, dst); break; case GGML_OP_SSM_CONV: ggml_cuda_op_ssm_conv(ctx, dst); @@ -2633,11 +2633,11 @@ static void ggml_backend_cuda_synchronize(ggml_backend_t backend) { } #ifdef USE_CUDA_GRAPH -static bool check_node_graph_compatibility_and_refresh_copy_ops(ggml_backend_cuda_context * cuda_ctx, ggml_cgraph * cgraph, +static bool check_node_graph_compatibility_and_refresh_copy_ops(ggml_cuda_graph * cuda_graph, const ggml_cgraph * cgraph, cudaStream_t stream, bool use_cuda_graph) { // Loop over nodes in GGML graph to obtain info needed for CUDA graph - cuda_ctx->cuda_graph->cpy_dest_ptrs.clear(); + cuda_graph->cpy_dest_ptrs.clear(); const std::string gemma3n_per_layer_proj_src0_name = "inp_per_layer_selected"; const std::string gemma3n_per_layer_proj_src1_name = "per_layer_proj"; @@ -2692,7 +2692,7 @@ static bool check_node_graph_compatibility_and_refresh_copy_ops(ggml_backend_cud // Store the pointers which are updated for each token, such that these can be sent // to the device and accessed using indirection from CUDA graph - cuda_ctx->cuda_graph->cpy_dest_ptrs.push_back((char *) node->src[1]->data); + cuda_graph->cpy_dest_ptrs.push_back((char *) node->src[1]->data); // store a pointer to each copy op CUDA kernel to identify it later void * ptr = ggml_cuda_cpy_fn(node->src[0], node->src[1]); @@ -2710,9 +2710,9 @@ static bool check_node_graph_compatibility_and_refresh_copy_ops(ggml_backend_cud } if (use_cuda_graph) { - cuda_ctx->cuda_graph->use_cpy_indirection = true; + cuda_graph->use_cpy_indirection = true; // copy pointers to GPU so they can be accessed via indirection within CUDA graph - ggml_cuda_cpy_dest_ptrs_copy(cuda_ctx->cuda_graph.get(), cuda_ctx->cuda_graph->cpy_dest_ptrs.data(), cuda_ctx->cuda_graph->cpy_dest_ptrs.size(), cuda_ctx->stream()); + ggml_cuda_cpy_dest_ptrs_copy(cuda_graph, cuda_graph->cpy_dest_ptrs.data(), cuda_graph->cpy_dest_ptrs.size(), stream); } return use_cuda_graph; @@ -2769,18 +2769,18 @@ static bool ggml_graph_node_has_matching_properties(ggml_tensor * node, ggml_gra return true; } -static bool is_cuda_graph_update_required(ggml_backend_cuda_context * cuda_ctx, ggml_cgraph * cgraph) { +static bool is_cuda_graph_update_required(ggml_cuda_graph * cuda_graph, const ggml_cgraph * cgraph) { bool cuda_graph_update_required = false; - if (cuda_ctx->cuda_graph->instance == nullptr) { + if (cuda_graph->instance == nullptr) { cuda_graph_update_required = true; } // Check if the graph size has changed - if (cuda_ctx->cuda_graph->ggml_graph_properties.size() != (size_t)cgraph->n_nodes) { + if (cuda_graph->ggml_graph_properties.size() != (size_t)cgraph->n_nodes) { 
cuda_graph_update_required = true; - cuda_ctx->cuda_graph->ggml_graph_properties.resize(cgraph->n_nodes); + cuda_graph->ggml_graph_properties.resize(cgraph->n_nodes); } // Loop over nodes in GGML graph to determine if CUDA graph update is required @@ -2788,26 +2788,26 @@ static bool is_cuda_graph_update_required(ggml_backend_cuda_context * cuda_ctx, for (int i = 0; i < cgraph->n_nodes; i++) { bool has_matching_properties = true; if (!cuda_graph_update_required) { - has_matching_properties = ggml_graph_node_has_matching_properties(cgraph->nodes[i], &cuda_ctx->cuda_graph->ggml_graph_properties[i]); + has_matching_properties = ggml_graph_node_has_matching_properties(cgraph->nodes[i], &cuda_graph->ggml_graph_properties[i]); } if (!has_matching_properties) { cuda_graph_update_required = true; } - set_ggml_graph_node_properties(cgraph->nodes[i], &cuda_ctx->cuda_graph->ggml_graph_properties[i]); + set_ggml_graph_node_properties(cgraph->nodes[i], &cuda_graph->ggml_graph_properties[i]); } return cuda_graph_update_required; } -static void update_cuda_graph_executable(ggml_backend_cuda_context * cuda_ctx) { +static void update_cuda_graph_executable(ggml_cuda_graph * cuda_graph) { #if CUDART_VERSION >= 12000 cudaGraphExecUpdateResultInfo result_info; - cudaError_t stat = cudaGraphExecUpdate(cuda_ctx->cuda_graph->instance, cuda_ctx->cuda_graph->graph, &result_info); + cudaError_t stat = cudaGraphExecUpdate(cuda_graph->instance, cuda_graph->graph, &result_info); #else cudaGraphNode_t errorNode; cudaGraphExecUpdateResult result_info; - cudaError_t stat = cudaGraphExecUpdate(cuda_ctx->cuda_graph->instance, cuda_ctx->cuda_graph->graph, &errorNode, &result_info); + cudaError_t stat = cudaGraphExecUpdate(cuda_graph->instance, cuda_graph->graph, &errorNode, &result_info); #endif // CUDART_VERSION >= 12000 if (stat == cudaErrorGraphExecUpdateFailure) { @@ -2818,9 +2818,9 @@ static void update_cuda_graph_executable(ggml_backend_cuda_context * cuda_ctx) { // The pre-existing graph exec cannot be updated due to violated constraints // so instead clear error and re-instantiate (void)cudaGetLastError(); - CUDA_CHECK(cudaGraphExecDestroy(cuda_ctx->cuda_graph->instance)); - cuda_ctx->cuda_graph->instance = nullptr; - CUDA_CHECK(cudaGraphInstantiate(&cuda_ctx->cuda_graph->instance, cuda_ctx->cuda_graph->graph, NULL, NULL, 0)); + CUDA_CHECK(cudaGraphExecDestroy(cuda_graph->instance)); + cuda_graph->instance = nullptr; + CUDA_CHECK(cudaGraphInstantiate(&cuda_graph->instance, cuda_graph->graph, NULL, NULL, 0)); } else { GGML_ASSERT(stat == cudaSuccess); } @@ -2941,224 +2941,244 @@ static bool ggml_cuda_can_fuse(const struct ggml_cgraph * cgraph, int node_idx, return false; } -static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx, ggml_cgraph * cgraph, - bool & graph_evaluated_or_captured, bool & use_cuda_graph, bool & cuda_graph_update_required) { +static void evaluate_cuda_graph(ggml_backend_cuda_context * cuda_ctx, ggml_cuda_graph * cuda_graph, const ggml_cgraph * cgraph) { // flag used to determine whether it is an integrated_gpu const bool integrated = ggml_cuda_info().devices[cuda_ctx->device].integrated; - while (!graph_evaluated_or_captured) { - // Only perform the graph execution if CUDA graphs are not enabled, or we are capturing the graph. - // With the use of CUDA graphs, the execution will be performed by the graph launch. 
- if (!use_cuda_graph || cuda_graph_update_required) { - - for (int i = 0; i < cgraph->n_nodes; i++) { - ggml_tensor * node = cgraph->nodes[i]; - - if (ggml_is_empty(node) || node->op == GGML_OP_RESHAPE || node->op == GGML_OP_TRANSPOSE || node->op == GGML_OP_VIEW || node->op == GGML_OP_PERMUTE || node->op == GGML_OP_NONE) { - continue; - } + for (int i = 0; i < cgraph->n_nodes; i++) { + ggml_tensor * node = cgraph->nodes[i]; - static bool disable_fusion = (getenv("GGML_CUDA_DISABLE_FUSION") != nullptr); - if (!disable_fusion) { + if (ggml_is_empty(node) || node->op == GGML_OP_RESHAPE || node->op == GGML_OP_TRANSPOSE || node->op == GGML_OP_VIEW || node->op == GGML_OP_PERMUTE || node->op == GGML_OP_NONE) { + continue; + } - if (ggml_cuda_can_fuse(cgraph, i, ggml_cuda_topk_moe_ops(/*with norm*/ true), {})) { - ggml_tensor * weights = cgraph->nodes[i+8]; - ggml_tensor * selected_experts = cgraph->nodes[i+3]; - ggml_cuda_op_topk_moe(*cuda_ctx, node, weights, selected_experts, /*with norm*/ true); - i += 8; - continue; - } + static bool disable_fusion = (getenv("GGML_CUDA_DISABLE_FUSION") != nullptr); + if (!disable_fusion) { - if (ggml_cuda_can_fuse(cgraph, i, ggml_cuda_topk_moe_ops(/*with norm*/ false), {})) { - ggml_tensor * weights = cgraph->nodes[i+4]; - ggml_tensor * selected_experts = cgraph->nodes[i+3]; - ggml_cuda_op_topk_moe(*cuda_ctx, node, weights, selected_experts, /*with norm*/ false); - i += 4; - continue; - } + if (ggml_cuda_can_fuse(cgraph, i, ggml_cuda_topk_moe_ops(/*with norm*/ true), {})) { + ggml_tensor * weights = cgraph->nodes[i+8]; + ggml_tensor * selected_experts = cgraph->nodes[i+3]; + ggml_cuda_op_topk_moe(*cuda_ctx, node, weights, selected_experts, /*with norm*/ true); + i += 8; + continue; + } - if (node->op == GGML_OP_ADD) { - int n_fuse = 0; - ggml_op ops[8]; - std::fill(ops, ops + 8, GGML_OP_ADD); + if (ggml_cuda_can_fuse(cgraph, i, ggml_cuda_topk_moe_ops(/*with norm*/ false), {})) { + ggml_tensor * weights = cgraph->nodes[i+4]; + ggml_tensor * selected_experts = cgraph->nodes[i+3]; + ggml_cuda_op_topk_moe(*cuda_ctx, node, weights, selected_experts, /*with norm*/ false); + i += 4; + continue; + } - for (; n_fuse <= 6; ++n_fuse){ - if (!ggml_can_fuse(cgraph, i + n_fuse, ops + n_fuse, 2)) { - break; - } - if (cgraph->nodes[i + n_fuse] != cgraph->nodes[i + n_fuse + 1]->src[0]) { - break; - } - if (!ggml_are_same_layout(cgraph->nodes[i + n_fuse]->src[1], cgraph->nodes[i + n_fuse + 1]->src[1])) { - break; - } - } + if (node->op == GGML_OP_ADD) { + int n_fuse = 0; + ggml_op ops[8]; + std::fill(ops, ops + 8, GGML_OP_ADD); - n_fuse++; + for (; n_fuse <= 6; ++n_fuse){ + if (!ggml_can_fuse(cgraph, i + n_fuse, ops + n_fuse, 2)) { + break; + } + if (cgraph->nodes[i + n_fuse] != cgraph->nodes[i + n_fuse + 1]->src[0]) { + break; + } + if (!ggml_are_same_layout(cgraph->nodes[i + n_fuse]->src[1], cgraph->nodes[i + n_fuse + 1]->src[1])) { + break; + } + } - if (n_fuse > 1) { - for (int j = 0; j < n_fuse - 1; ++j) { - node->src[j + 2] = cgraph->nodes[i + j + 1]->src[1]; - } - cgraph->nodes[i + n_fuse - 1]->data = node->data; - ggml_cuda_op_fused_add(*cuda_ctx, node, n_fuse); - i += n_fuse - 1; + n_fuse++; - continue; - } + if (n_fuse > 1) { + for (int j = 0; j < n_fuse - 1; ++j) { + node->src[j + 2] = cgraph->nodes[i + j + 1]->src[1]; } + cgraph->nodes[i + n_fuse - 1]->data = node->data; + ggml_cuda_op_fused_add(*cuda_ctx, node, n_fuse); + i += n_fuse - 1; + continue; + } + } - if (ggml_cuda_can_fuse(cgraph, i, { GGML_OP_RMS_NORM, GGML_OP_MUL, GGML_OP_ADD}, {})) { - 
ggml_cuda_op_rms_norm_fused_add(*cuda_ctx, node, cgraph->nodes[i+1], cgraph->nodes[i+2]); - i += 2; - continue; - } - if (ggml_cuda_can_fuse(cgraph, i, { GGML_OP_RMS_NORM, GGML_OP_MUL}, {})) { - ggml_cuda_op_rms_norm_fused(*cuda_ctx, node, cgraph->nodes[i+1]); - i++; - continue; - } + if (ggml_cuda_can_fuse(cgraph, i, { GGML_OP_RMS_NORM, GGML_OP_MUL, GGML_OP_ADD}, {})) { + ggml_cuda_op_rms_norm_fused_add(*cuda_ctx, node, cgraph->nodes[i+1], cgraph->nodes[i+2]); + i += 2; + continue; + } - if (ggml_cuda_can_fuse(cgraph, i, { GGML_OP_SCALE, GGML_OP_UNARY, GGML_OP_SCALE }, { GGML_UNARY_OP_TANH })) { - i += 2; - ggml_cuda_op_softcap(*cuda_ctx, cgraph->nodes[i], node); - continue; - } - } + if (ggml_cuda_can_fuse(cgraph, i, { GGML_OP_RMS_NORM, GGML_OP_MUL}, {})) { + ggml_cuda_op_rms_norm_fused(*cuda_ctx, node, cgraph->nodes[i+1]); + i++; + continue; + } + + if (ggml_cuda_can_fuse(cgraph, i, { GGML_OP_SCALE, GGML_OP_UNARY, GGML_OP_SCALE }, { GGML_UNARY_OP_TANH })) { + i += 2; + ggml_cuda_op_softcap(*cuda_ctx, cgraph->nodes[i], node); + continue; + } + } #ifndef NDEBUG - assert(node->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device)); - for (int j = 0; j < GGML_MAX_SRC; j++) { - if (node->src[j] != nullptr) { - assert(node->src[j]->buffer); - assert(node->src[j]->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device) || - ggml_backend_buft_is_cuda_split(node->src[j]->buffer->buft) || (integrated && ggml_backend_buft_is_cuda_host(node->src[j]->buffer->buft))); - } - } + assert(node->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device)); + for (int j = 0; j < GGML_MAX_SRC; j++) { + if (node->src[j] != nullptr) { + assert(node->src[j]->buffer); + assert(node->src[j]->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device) || + ggml_backend_buft_is_cuda_split(node->src[j]->buffer->buft) || (integrated && ggml_backend_buft_is_cuda_host(node->src[j]->buffer->buft))); + } + } #else - GGML_UNUSED(integrated); + GGML_UNUSED(integrated); #endif // NDEBUG - bool ok = ggml_cuda_compute_forward(*cuda_ctx, node); - if (!ok) { - GGML_LOG_ERROR("%s: op not supported %s (%s)\n", __func__, node->name, ggml_op_name(node->op)); - } - GGML_ASSERT(ok); - } + bool ok = ggml_cuda_compute_forward(*cuda_ctx, cuda_graph, node); + if (!ok) { + GGML_LOG_ERROR("%s: op not supported %s (%s)\n", __func__, node->name, ggml_op_name(node->op)); } + GGML_ASSERT(ok); + } +} #ifdef USE_CUDA_GRAPH - if (use_cuda_graph && cuda_graph_update_required) { // End CUDA graph capture - if (cuda_ctx->cuda_graph->graph != nullptr) { - CUDA_CHECK(cudaGraphDestroy(cuda_ctx->cuda_graph->graph)); - cuda_ctx->cuda_graph->graph = nullptr; - } +static void capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx, ggml_cuda_graph * cuda_graph, const ggml_cgraph * cgraph) { + // Start CUDA graph capture + { + std::lock_guard lock(ggml_cuda_lock); + ggml_cuda_lock_counter.fetch_add(1, std::memory_order_relaxed); + } - CUDA_CHECK(cudaStreamEndCapture(cuda_ctx->stream(), &cuda_ctx->cuda_graph->graph)); - graph_evaluated_or_captured = true; // CUDA graph has been captured + CUDA_CHECK(cudaStreamBeginCapture(cuda_ctx->stream(), cudaStreamCaptureModeRelaxed)); - std::lock_guard lock(ggml_cuda_lock); - if (ggml_cuda_lock_counter.fetch_sub(1, std::memory_order_relaxed) == 1) { - ggml_cuda_lock_cv.notify_all(); - } - } else { - graph_evaluated_or_captured = true; // ggml graph has been directly evaluated - } + evaluate_cuda_graph(cuda_ctx, cuda_graph, cgraph); + + if (cuda_graph->graph != nullptr) { + 
CUDA_CHECK(cudaGraphDestroy(cuda_graph->graph)); + cuda_graph->graph = nullptr; } - if (use_cuda_graph) { - if (cuda_ctx->cuda_graph->instance == nullptr) { // Create executable graph from captured graph. - CUDA_CHECK(cudaGraphInstantiate(&cuda_ctx->cuda_graph->instance, cuda_ctx->cuda_graph->graph, NULL, NULL, 0)); - } - if (cuda_graph_update_required) { // Update graph executable - update_cuda_graph_executable(cuda_ctx); - } - // Launch graph - CUDA_CHECK(cudaGraphLaunch(cuda_ctx->cuda_graph->instance, cuda_ctx->stream())); -#else - graph_evaluated_or_captured = true; -#endif // USE_CUDA_GRAPH + CUDA_CHECK(cudaStreamEndCapture(cuda_ctx->stream(), &cuda_graph->graph)); + + if (cuda_graph->instance == nullptr) { + CUDA_CHECK(cudaGraphInstantiate(&cuda_graph->instance, cuda_graph->graph, NULL, NULL, 0)); + } else { + update_cuda_graph_executable(cuda_graph); + } + + std::lock_guard lock(ggml_cuda_lock); + if (ggml_cuda_lock_counter.fetch_sub(1, std::memory_order_relaxed) == 1) { + ggml_cuda_lock_cv.notify_all(); } } -static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) { +static ggml_backend_graph_plan_t ggml_backend_cuda_graph_plan_create(ggml_backend_t backend, const struct ggml_cgraph* cgraph) { ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context; ggml_cuda_set_device(cuda_ctx->device); -#ifdef USE_CUDA_GRAPH static const bool disable_cuda_graphs_due_to_env = (getenv("GGML_CUDA_DISABLE_GRAPHS") != nullptr); - // Objects required for CUDA Graph - if (cuda_ctx->cuda_graph == nullptr) { - cuda_ctx->cuda_graph.reset(new ggml_cuda_graph()); - } + ggml_cuda_graph * cuda_graph = new ggml_cuda_graph(); + + cuda_graph->cgraph = cgraph; bool use_cuda_graph = true; - bool cuda_graph_update_required = false; - if (cuda_ctx->cuda_graph->graph == nullptr) { + if (!cuda_ctx->cuda_graph_initialized) { if (ggml_cuda_info().devices[cuda_ctx->device].cc < GGML_CUDA_CC_AMPERE) { - cuda_ctx->cuda_graph->disable_due_to_gpu_arch = true; + cuda_ctx->disable_due_to_gpu_arch = true; #ifndef NDEBUG GGML_LOG_DEBUG("%s: disabling CUDA graphs due to GPU architecture\n", __func__); #endif } + cuda_ctx->cuda_graph_initialized = true; } // Disable CUDA graphs in presence of env var, old GPU, use-case which is changing too rapidly, // or previous graph capture failure. // Also disable for multi-gpu for now. TO DO investigate if (disable_cuda_graphs_due_to_env - || cuda_ctx->cuda_graph->disable_due_to_gpu_arch - || cuda_ctx->cuda_graph->disable_due_to_too_many_updates - || cuda_ctx->cuda_graph->disable_due_to_failed_graph_capture) { + || cuda_ctx->disable_due_to_gpu_arch) { use_cuda_graph = false; } if (use_cuda_graph) { - cuda_graph_update_required = is_cuda_graph_update_required(cuda_ctx, cgraph); + use_cuda_graph = check_node_graph_compatibility_and_refresh_copy_ops(cuda_graph, cgraph, cuda_ctx->stream(), use_cuda_graph); + } + + if (use_cuda_graph) { + capture_cuda_graph(cuda_ctx, cuda_graph, cgraph); + } else { + cuda_graph->use_cpy_indirection = false; + } - use_cuda_graph = check_node_graph_compatibility_and_refresh_copy_ops(cuda_ctx, cgraph, use_cuda_graph); + return cuda_graph; +} - // Disable CUDA graphs (from the next token) if the use-case is demanding too many consecutive graph updates. 
- if (use_cuda_graph && cuda_graph_update_required) { - cuda_ctx->cuda_graph->number_consecutive_updates++; - } else { - cuda_ctx->cuda_graph->number_consecutive_updates = 0; - } +static void ggml_backend_cuda_graph_plan_free(ggml_backend_t backend, ggml_backend_graph_plan_t plan) { + delete (ggml_cuda_graph *) plan; - if (cuda_ctx->cuda_graph->number_consecutive_updates >= 4) { - cuda_ctx->cuda_graph->disable_due_to_too_many_updates = true; -#ifndef NDEBUG - GGML_LOG_DEBUG("%s: disabling CUDA graphs due to too many consecutive updates\n", __func__); -#endif - } + GGML_UNUSED(backend); +} + +static void ggml_backend_cuda_graph_plan_update(ggml_backend_t backend, ggml_backend_graph_plan_t plan, const ggml_cgraph* cgraph) { + ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *) backend->context; + + ggml_cuda_set_device(cuda_ctx->device); + + ggml_cuda_graph * cuda_graph = (ggml_cuda_graph *) plan; + + cuda_graph->cgraph = cgraph; + + if (!cuda_graph->graph) { + return; } - if (use_cuda_graph && cuda_graph_update_required) { - // Start CUDA graph capture - { - std::lock_guard lock(ggml_cuda_lock); - ggml_cuda_lock_counter.fetch_add(1, std::memory_order_relaxed); + bool use_cuda_graph = true; + + use_cuda_graph = check_node_graph_compatibility_and_refresh_copy_ops(cuda_graph, cgraph, cuda_ctx->stream(), use_cuda_graph); + + if (!use_cuda_graph) { + if (cuda_graph->instance != nullptr) { + CUDA_CHECK(cudaGraphExecDestroy(cuda_graph->instance)); + } + if (cuda_graph->graph != nullptr) { + CUDA_CHECK(cudaGraphDestroy(cuda_graph->graph)); } + cuda_graph->instance = nullptr; + cuda_graph->graph = nullptr; + cuda_graph->use_cpy_indirection = false; + } - CUDA_CHECK(cudaStreamBeginCapture(cuda_ctx->stream(), cudaStreamCaptureModeRelaxed)); + if (is_cuda_graph_update_required(cuda_graph, cgraph)) { + capture_cuda_graph(cuda_ctx, cuda_graph, cgraph); } +} - if (!use_cuda_graph) { - cuda_ctx->cuda_graph->use_cpy_indirection = false; +static enum ggml_status ggml_backend_cuda_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan) { + ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *) backend->context; + + ggml_cuda_graph * cuda_graph = (ggml_cuda_graph *) plan; + + ggml_cuda_set_device(cuda_ctx->device); + + if (cuda_graph->instance) { + CUDA_CHECK(cudaGraphLaunch(cuda_graph->instance, cuda_ctx->stream())); + } else { + evaluate_cuda_graph(cuda_ctx, cuda_graph, cuda_graph->cgraph); } + return GGML_STATUS_SUCCESS; +} +#endif -#else - bool use_cuda_graph = false; - bool cuda_graph_update_required = false; -#endif // USE_CUDA_GRAPH +static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) { + ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *) backend->context; - bool graph_evaluated_or_captured = false; + ggml_cuda_set_device(cuda_ctx->device); - evaluate_and_capture_cuda_graph(cuda_ctx, cgraph, graph_evaluated_or_captured, use_cuda_graph, cuda_graph_update_required); + evaluate_cuda_graph(cuda_ctx, nullptr, cgraph); return GGML_STATUS_SUCCESS; } @@ -3195,10 +3215,17 @@ static const ggml_backend_i ggml_backend_cuda_interface = { /* .get_tensor_async = */ ggml_backend_cuda_get_tensor_async, /* .cpy_tensor_async = */ ggml_backend_cuda_cpy_tensor_async, /* .synchronize = */ ggml_backend_cuda_synchronize, +#ifdef USE_CUDA_GRAPH + /* .graph_plan_create = */ ggml_backend_cuda_graph_plan_create, + /* .graph_plan_free = */ ggml_backend_cuda_graph_plan_free, + /* .graph_plan_update = */ 
ggml_backend_cuda_graph_plan_update, + /* .graph_plan_compute = */ ggml_backend_cuda_graph_plan_compute, +#else /* .graph_plan_create = */ NULL, /* .graph_plan_free = */ NULL, /* .graph_plan_update = */ NULL, /* .graph_plan_compute = */ NULL, +#endif /* .graph_compute = */ ggml_backend_cuda_graph_compute, /* .event_record = */ ggml_backend_cuda_event_record, /* .event_wait = */ ggml_backend_cuda_event_wait, diff --git a/ggml/src/ggml-cuda/mean.cu b/ggml/src/ggml-cuda/mean.cu index 347abc18660ca..67bbb0b62d8c5 100644 --- a/ggml/src/ggml-cuda/mean.cu +++ b/ggml/src/ggml-cuda/mean.cu @@ -10,7 +10,7 @@ template __global__ void divide_by_count(T * result, size_t count) *result /= static_cast(count); } -void ggml_cuda_op_mean(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { +void ggml_cuda_op_mean(ggml_backend_cuda_context & ctx, ggml_cuda_graph * cuda_graph, ggml_tensor * dst) { const ggml_tensor * src0 = dst->src[0]; const float * src0_d = (const float *) src0->data; float * dst_d = (float *) dst->data; @@ -33,14 +33,12 @@ void ggml_cuda_op_mean(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { #ifdef USE_CUDA_GRAPH // CUDA_GRAPHS_DISABLED ((ncols > 65536) && - ((ctx.cuda_graph->instance == nullptr) && (iscapturing == cudaStreamCaptureStatusNone) || - ctx.cuda_graph->disable_due_to_gpu_arch || ctx.cuda_graph->disable_due_to_too_many_updates || - ctx.cuda_graph->disable_due_to_failed_graph_capture)) || + ((cuda_graph && cuda_graph->instance == nullptr) && (iscapturing == cudaStreamCaptureStatusNone) || + ctx.disable_due_to_gpu_arch)) || // CUDA_GRAPHS ENABLED ((ncols > 32768) && - !((ctx.cuda_graph->instance == nullptr) && (iscapturing == cudaStreamCaptureStatusNone) || - ctx.cuda_graph->disable_due_to_gpu_arch || ctx.cuda_graph->disable_due_to_too_many_updates || - ctx.cuda_graph->disable_due_to_failed_graph_capture))) { + !((cuda_graph && cuda_graph->instance == nullptr) && (iscapturing == cudaStreamCaptureStatusNone) || + ctx.disable_due_to_gpu_arch))) { #else (ncols > 65536)) { #endif // USE_CUDA_GRAPH diff --git a/ggml/src/ggml-cuda/mean.cuh b/ggml/src/ggml-cuda/mean.cuh index 2b9b10433438e..14dca24736281 100644 --- a/ggml/src/ggml-cuda/mean.cuh +++ b/ggml/src/ggml-cuda/mean.cuh @@ -1,3 +1,3 @@ #include "common.cuh" -void ggml_cuda_op_mean(ggml_backend_cuda_context & ctx, ggml_tensor * dst); +void ggml_cuda_op_mean(ggml_backend_cuda_context & ctx, ggml_cuda_graph * cuda_graph, ggml_tensor * dst); From 3afbd9f3270473fa9c54e4c816c62afa87683deb Mon Sep 17 00:00:00 2001 From: Xiangyan Sun Date: Wed, 15 Oct 2025 20:45:06 -0700 Subject: [PATCH 2/6] Fix cuda graph update logic. 
--- ggml/src/ggml-cuda/common.cuh | 5 +- ggml/src/ggml-cuda/ggml-cuda.cu | 84 ++++++++++++++++++++------------- ggml/src/ggml-cuda/mean.cu | 4 +- 3 files changed, 57 insertions(+), 36 deletions(-) diff --git a/ggml/src/ggml-cuda/common.cuh b/ggml/src/ggml-cuda/common.cuh index 4eb90cbbb39fb..84c39543372d7 100644 --- a/ggml/src/ggml-cuda/common.cuh +++ b/ggml/src/ggml-cuda/common.cuh @@ -940,6 +940,7 @@ struct ggml_cuda_graph { size_t num_nodes = 0; std::vector nodes; std::vector params; + int number_consecutive_updates = 0; std::vector ggml_graph_properties; #endif }; @@ -954,7 +955,9 @@ struct ggml_backend_cuda_context { #ifdef USE_CUDA_GRAPH bool cuda_graph_initialized = false; - bool disable_due_to_gpu_arch = false; + bool disable_graph_due_to_env = false; + bool disable_graph_due_to_gpu_arch = false; + bool disable_graph_due_to_too_many_updates = false; #endif explicit ggml_backend_cuda_context(int device) : diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu index 9f6a0500aa2bc..d93c86818c761 100644 --- a/ggml/src/ggml-cuda/ggml-cuda.cu +++ b/ggml/src/ggml-cuda/ggml-cuda.cu @@ -2642,8 +2642,8 @@ static void ggml_backend_cuda_synchronize(ggml_backend_t backend) { } #ifdef USE_CUDA_GRAPH -static bool check_node_graph_compatibility(const ggml_cgraph * cgraph, - bool use_cuda_graph) { +static bool check_node_graph_compatibility(const ggml_cgraph * cgraph) { + bool use_cuda_graph = true; // Loop over nodes in GGML graph to obtain info needed for CUDA graph @@ -2753,8 +2753,14 @@ static bool ggml_graph_node_has_matching_properties(ggml_tensor * node, ggml_gra return true; } -static bool is_cuda_graph_update_required(ggml_cuda_graph * cuda_graph, const ggml_cgraph * cgraph) { +static void update_cuda_graph_properties(ggml_cuda_graph * cuda_graph, const ggml_cgraph * cgraph) { + cuda_graph->ggml_graph_properties.resize(cgraph->n_nodes); + for (int i = 0; i < cgraph->n_nodes; i++) { + set_ggml_graph_node_properties(cgraph->nodes[i], &cuda_graph->ggml_graph_properties[i]); + } +} +static bool is_cuda_graph_update_required(ggml_cuda_graph * cuda_graph, const ggml_cgraph * cgraph) { bool cuda_graph_update_required = false; if (cuda_graph->instance == nullptr) { @@ -2768,7 +2774,6 @@ static bool is_cuda_graph_update_required(ggml_cuda_graph * cuda_graph, const gg } // Loop over nodes in GGML graph to determine if CUDA graph update is required - // and store properties to allow this comparison for the next token for (int i = 0; i < cgraph->n_nodes; i++) { bool has_matching_properties = true; if (!cuda_graph_update_required) { @@ -2777,7 +2782,6 @@ static bool is_cuda_graph_update_required(ggml_cuda_graph * cuda_graph, const gg if (!has_matching_properties) { cuda_graph_update_required = true; } - set_ggml_graph_node_properties(cgraph->nodes[i], &cuda_graph->ggml_graph_properties[i]); } return cuda_graph_update_required; @@ -3057,22 +3061,14 @@ static void capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx, ggml_cuda_g } } -static ggml_backend_graph_plan_t ggml_backend_cuda_graph_plan_create(ggml_backend_t backend, const struct ggml_cgraph* cgraph) { - ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context; - - ggml_cuda_set_device(cuda_ctx->device); - - static const bool disable_cuda_graphs_due_to_env = (getenv("GGML_CUDA_DISABLE_GRAPHS") != nullptr); - - ggml_cuda_graph * cuda_graph = new ggml_cuda_graph(); - - cuda_graph->cgraph = cgraph; - +static bool should_use_cuda_graph(ggml_backend_cuda_context * cuda_ctx, const struct 
ggml_cgraph * cgraph) { bool use_cuda_graph = true; if (!cuda_ctx->cuda_graph_initialized) { + cuda_ctx->disable_graph_due_to_env = (getenv("GGML_CUDA_DISABLE_GRAPHS") != nullptr); + if (ggml_cuda_info().devices[cuda_ctx->device].cc < GGML_CUDA_CC_AMPERE) { - cuda_ctx->disable_due_to_gpu_arch = true; + cuda_ctx->disable_graph_due_to_gpu_arch = true; #ifndef NDEBUG GGML_LOG_DEBUG("%s: disabling CUDA graphs due to GPU architecture\n", __func__); #endif @@ -3083,17 +3079,30 @@ static ggml_backend_graph_plan_t ggml_backend_cuda_graph_plan_create(ggml_backen // Disable CUDA graphs in presence of env var, old GPU, use-case which is changing too rapidly, // or previous graph capture failure. // Also disable for multi-gpu for now. TO DO investigate - if (disable_cuda_graphs_due_to_env - || cuda_ctx->disable_due_to_gpu_arch) { + if (cuda_ctx->disable_graph_due_to_env || cuda_ctx->disable_graph_due_to_gpu_arch || + cuda_ctx->disable_graph_due_to_too_many_updates) { use_cuda_graph = false; } if (use_cuda_graph) { - use_cuda_graph = check_node_graph_compatibility(cgraph, use_cuda_graph); + use_cuda_graph = check_node_graph_compatibility(cgraph); } - if (use_cuda_graph) { + return use_cuda_graph; +} + +static ggml_backend_graph_plan_t ggml_backend_cuda_graph_plan_create(ggml_backend_t backend, const struct ggml_cgraph * cgraph) { + ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context; + + ggml_cuda_set_device(cuda_ctx->device); + + ggml_cuda_graph * cuda_graph = new ggml_cuda_graph(); + + cuda_graph->cgraph = cgraph; + + if (should_use_cuda_graph(cuda_ctx, cgraph)) { capture_cuda_graph(cuda_ctx, cuda_graph, cgraph); + update_cuda_graph_properties(cuda_graph, cgraph); } return cuda_graph; @@ -3105,7 +3114,7 @@ static void ggml_backend_cuda_graph_plan_free(ggml_backend_t backend, ggml_backe GGML_UNUSED(backend); } -static void ggml_backend_cuda_graph_plan_update(ggml_backend_t backend, ggml_backend_graph_plan_t plan, const ggml_cgraph* cgraph) { +static void ggml_backend_cuda_graph_plan_update(ggml_backend_t backend, ggml_backend_graph_plan_t plan, const ggml_cgraph * cgraph) { ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *) backend->context; ggml_cuda_set_device(cuda_ctx->device); @@ -3114,15 +3123,28 @@ static void ggml_backend_cuda_graph_plan_update(ggml_backend_t backend, ggml_bac cuda_graph->cgraph = cgraph; - if (!cuda_graph->graph) { - return; - } - - bool use_cuda_graph = true; + bool use_cuda_graph = should_use_cuda_graph(cuda_ctx, cgraph); + bool cuda_graph_update_required = false; - use_cuda_graph = check_node_graph_compatibility(cgraph, use_cuda_graph); + // check if we are doing a graph update + if (cuda_graph->instance == nullptr && use_cuda_graph // no graph -> graph + || cuda_graph->instance != nullptr && !use_cuda_graph // graph -> no graph + || use_cuda_graph && is_cuda_graph_update_required(cuda_graph, cgraph)) { // graph property mismatch + cuda_graph->number_consecutive_updates++; + if (cuda_graph->number_consecutive_updates >= 4) { + cuda_ctx->disable_graph_due_to_too_many_updates = true; + use_cuda_graph = false; + } else { + cuda_graph_update_required = true; + } + } else { + cuda_graph->number_consecutive_updates = 0; + } - if (!use_cuda_graph) { + if (use_cuda_graph && cuda_graph_update_required) { + capture_cuda_graph(cuda_ctx, cuda_graph, cgraph); + update_cuda_graph_properties(cuda_graph, cgraph); + } else if (!use_cuda_graph) { if (cuda_graph->instance != nullptr) { CUDA_CHECK(cudaGraphExecDestroy(cuda_graph->instance)); } 
@@ -3132,10 +3154,6 @@ static void ggml_backend_cuda_graph_plan_update(ggml_backend_t backend, ggml_bac
         cuda_graph->instance = nullptr;
         cuda_graph->graph = nullptr;
     }
-
-    if (is_cuda_graph_update_required(cuda_graph, cgraph)) {
-        capture_cuda_graph(cuda_ctx, cuda_graph, cgraph);
-    }
 }
 
 static enum ggml_status ggml_backend_cuda_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
diff --git a/ggml/src/ggml-cuda/mean.cu b/ggml/src/ggml-cuda/mean.cu
index 67bbb0b62d8c5..6e91f236b958a 100644
--- a/ggml/src/ggml-cuda/mean.cu
+++ b/ggml/src/ggml-cuda/mean.cu
@@ -34,11 +34,11 @@ void ggml_cuda_op_mean(ggml_backend_cuda_context & ctx, ggml_cuda_graph * cuda_g
         // CUDA_GRAPHS_DISABLED
         ((ncols > 65536) &&
         ((cuda_graph && cuda_graph->instance == nullptr) && (iscapturing == cudaStreamCaptureStatusNone) ||
-        ctx.disable_due_to_gpu_arch)) ||
+        ctx.disable_graph_due_to_env || ctx.disable_graph_due_to_gpu_arch || ctx.disable_graph_due_to_too_many_updates)) ||
         // CUDA_GRAPHS ENABLED
         ((ncols > 32768) &&
         !((cuda_graph && cuda_graph->instance == nullptr) && (iscapturing == cudaStreamCaptureStatusNone) ||
-        ctx.disable_due_to_gpu_arch))) {
+        ctx.disable_graph_due_to_env || ctx.disable_graph_due_to_gpu_arch || ctx.disable_graph_due_to_too_many_updates))) {
 #else
         (ncols > 65536)) {
 #endif // USE_CUDA_GRAPH

From 5c89e8dd717449f45b5bfeec7aee99a7cb121802 Mon Sep 17 00:00:00 2001
From: Xiangyan Sun
Date: Thu, 23 Oct 2025 09:24:43 -0700
Subject: [PATCH 3/6] Fix wording.

---
 ggml/src/ggml-backend.cpp | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/ggml/src/ggml-backend.cpp b/ggml/src/ggml-backend.cpp
index dd4007d89a3e7..718a688454c9a 100644
--- a/ggml/src/ggml-backend.cpp
+++ b/ggml/src/ggml-backend.cpp
@@ -732,7 +732,7 @@ struct ggml_backend_sched {
     struct ggml_backend_sched_plan * plans;
     int n_plans;
     int plans_capacity;
-    bool plan_dirty;
+    bool plan_needs_update;
 
     // pipeline parallelism support
     int n_copies;
@@ -1412,7 +1412,7 @@ void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgra
         assert(graph_copy->size > graph_copy->n_leafs);
         graph_copy->leafs[graph_copy->n_leafs++] = leaf;
     }
-    sched->plan_dirty = true;
+    sched->plan_needs_update = true;
 }
 
 static bool ggml_backend_sched_alloc_splits(ggml_backend_sched_t sched) {
@@ -1456,7 +1456,7 @@ static bool ggml_backend_sched_alloc_splits(ggml_backend_sched_t sched) {
 
 static void ggml_backend_sched_update_plans(ggml_backend_sched_t sched) {
     // create graph plans
-    if (sched->plan_dirty) {
+    if (sched->plan_needs_update) {
         bool create_new_plans;
         if (sched->n_plans == sched->n_splits) {
             create_new_plans = false;
@@ -1506,7 +1506,7 @@ static void ggml_backend_sched_update_plans(ggml_backend_sched_t sched) {
                 }
             }
         }
-        sched->plan_dirty = false;
+        sched->plan_needs_update = false;
     }
 }

From b7f366fad1942b7c7f92d09e872121a3b3fd3816 Mon Sep 17 00:00:00 2001
From: Xiangyan Sun
Date: Thu, 23 Oct 2025 09:56:45 -0700
Subject: [PATCH 4/6] Fix faulty consecutive updates handling.

---
 ggml/src/ggml-cuda/common.cuh   | 1 +
 ggml/src/ggml-cuda/ggml-cuda.cu | 8 ++++++--
 2 files changed, 7 insertions(+), 2 deletions(-)

diff --git a/ggml/src/ggml-cuda/common.cuh b/ggml/src/ggml-cuda/common.cuh
index 84c39543372d7..a6ee559eb2ca2 100644
--- a/ggml/src/ggml-cuda/common.cuh
+++ b/ggml/src/ggml-cuda/common.cuh
@@ -941,6 +941,7 @@ struct ggml_cuda_graph {
     std::vector<cudaGraphNode_t> nodes;
     std::vector<cudaKernelNodeParams> params;
     int number_consecutive_updates = 0;
+    int number_consecutive_computes = 0;
     std::vector<ggml_graph_node_properties> ggml_graph_properties;
 #endif
 };
diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
index 919521d326d43..fb469d126834d 100644
--- a/ggml/src/ggml-cuda/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu
@@ -3149,8 +3149,7 @@ static void ggml_backend_cuda_graph_plan_update(ggml_backend_t backend, ggml_bac
         } else {
             cuda_graph_update_required = true;
         }
-    } else {
-        cuda_graph->number_consecutive_updates = 0;
+        cuda_graph->number_consecutive_computes = 0;
     }
 
     if (use_cuda_graph && cuda_graph_update_required) {
@@ -3175,6 +3174,11 @@ static enum ggml_status ggml_backend_cuda_graph_plan_compute(ggml_backend_t back
 
     ggml_cuda_set_device(cuda_ctx->device);
 
+    cuda_graph->number_consecutive_computes++;
+    if (cuda_graph->number_consecutive_computes > 1) {
+        cuda_graph->number_consecutive_updates = 0;
+    }
+
     if (cuda_graph->instance) {
         CUDA_CHECK(cudaGraphLaunch(cuda_graph->instance, cuda_ctx->stream()));
     } else {

From 5586c67351a8ca832a31e70ccd3b0bab6b48df2c Mon Sep 17 00:00:00 2001
From: Xiangyan Sun
Date: Thu, 23 Oct 2025 10:20:42 -0700
Subject: [PATCH 5/6] Fix editorconfig check.

---
 ggml/src/ggml-cuda/ggml-cuda.cu | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
index fb469d126834d..d5d70af273552 100644
--- a/ggml/src/ggml-cuda/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu
@@ -3128,7 +3128,7 @@ static void ggml_backend_cuda_graph_plan_free(ggml_backend_t backend, ggml_backe
 
 static void ggml_backend_cuda_graph_plan_update(ggml_backend_t backend, ggml_backend_graph_plan_t plan, const ggml_cgraph * cgraph) {
     ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *) backend->context;
-
+
     ggml_cuda_set_device(cuda_ctx->device);
 
     ggml_cuda_graph * cuda_graph = (ggml_cuda_graph *) plan;

From de031bcc2cdf61850757b36cce6a616d27e3bc5e Mon Sep 17 00:00:00 2001
From: Xiangyan Sun
Date: Thu, 23 Oct 2025 10:49:03 -0700
Subject: [PATCH 6/6] Forward declaration for ggml_backend_supports_graph_plan and ggml_backend_supports_graph_plan_update

---
 ggml/include/ggml-backend.h | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/ggml/include/ggml-backend.h b/ggml/include/ggml-backend.h
index c350ae205f15d..48ab40e678073 100644
--- a/ggml/include/ggml-backend.h
+++ b/ggml/include/ggml-backend.h
@@ -93,6 +93,8 @@ extern "C" {
 
     GGML_API void ggml_backend_synchronize(ggml_backend_t backend);
 
+    GGML_API bool ggml_backend_supports_graph_plan(ggml_backend_t backend);
+    GGML_API bool ggml_backend_supports_graph_plan_update(ggml_backend_t backend);
     GGML_API ggml_backend_graph_plan_t ggml_backend_graph_plan_create(ggml_backend_t backend, struct ggml_cgraph * cgraph);
     GGML_API void ggml_backend_graph_plan_free (ggml_backend_t backend, ggml_backend_graph_plan_t plan);
     GGML_API void ggml_backend_graph_plan_update(ggml_backend_t backend, ggml_backend_graph_plan_t plan, const struct ggml_cgraph * cgraph);
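
Usage illustration (not part of the patches above): a minimal sketch of how a caller outside ggml_backend_sched might drive the graph-plan API added in this series. The helper name run_with_graph_plan, the n_iter loop, and the refresh-every-iteration policy are assumptions for the example, not code from the series.

#include "ggml-backend.h"

// Hypothetical helper: run a graph n_iter times through a backend graph plan,
// falling back to plain graph compute when the backend does not support plans.
static enum ggml_status run_with_graph_plan(ggml_backend_t backend, struct ggml_cgraph * graph, int n_iter) {
    if (!ggml_backend_supports_graph_plan(backend)) {
        enum ggml_status status = GGML_STATUS_SUCCESS;
        for (int i = 0; i < n_iter && status == GGML_STATUS_SUCCESS; i++) {
            status = ggml_backend_graph_compute(backend, graph);
        }
        return status;
    }

    // For the CUDA backend this captures the ggml graph into a CUDA graph and instantiates it.
    ggml_backend_graph_plan_t plan = ggml_backend_graph_plan_create(backend, graph);

    enum ggml_status status = GGML_STATUS_SUCCESS;
    for (int i = 0; i < n_iter && status == GGML_STATUS_SUCCESS; i++) {
        if (i > 0) {
            // Refresh the plan when node data or parameters may have changed between runs;
            // backends without plan_update get a free + re-create instead, mirroring the
            // policy of ggml_backend_sched_update_plans() in patch 1/6.
            if (ggml_backend_supports_graph_plan_update(backend)) {
                ggml_backend_graph_plan_update(backend, plan, graph);
            } else {
                ggml_backend_graph_plan_free(backend, plan);
                plan = ggml_backend_graph_plan_create(backend, graph);
            }
        }
        status = ggml_backend_graph_plan_compute(backend, plan);
    }

    ggml_backend_graph_plan_free(backend, plan);
    return status;
}

Inside the scheduler itself this logic lives in ggml_backend_sched_update_plans() and ggml_backend_sched_compute_splits(), which cache one plan per split and fall back to ggml_backend_graph_compute_async() for backends without plan support.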