@@ -787,6 +787,9 @@ static vk_submission ggml_vk_create_submission(vk_device& device, vk_queue& q, s
 
 static void ggml_vk_submit(vk_context& ctx, vk::Fence fence) {
     if (ctx->seqs.empty()) {
+        if (fence) {
+            ctx->q->queue.submit({}, fence);
+        }
         return;
     }
     VK_LOG_DEBUG("ggml_vk_submit(" << ctx << ", " << fence << ")");
@@ -5658,11 +5661,15 @@ static void ggml_vk_preallocate_buffers(ggml_backend_vk_context * ctx) {
     }
 }
 
-static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * node, int node_idx, bool last_node, bool dryrun){
+static bool ggml_vk_compute_forward(ggml_backend_vk_context* ctx, ggml_tensor* tensor, int tensor_idx, bool use_fence);
+
+// Returns true if node has enqueued work into the queue, false otherwise
+// If submit is true, all operations queued so far are submitted to Vulkan to overlap command buffer creation and GPU execution.
+static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * node, int node_idx, ggml_tensor *node_begin, int node_idx_begin, bool dryrun, bool last_node, bool submit){
     ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) node->extra;
 
     if (ggml_is_empty(node) || extra == nullptr) {
-        return;
+        return false;
     }
 
     VK_LOG_DEBUG("ggml_vk_build_graph(" << node << ", " << ggml_op_name(node->op) << ")");
@@ -5679,7 +5686,7 @@ static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
     case GGML_OP_PERMUTE:
     case GGML_OP_TRANSPOSE:
     case GGML_OP_NONE:
-        return;
+        return false;
     case GGML_OP_UNARY:
         switch (ggml_get_unary_op(node)) {
         case GGML_UNARY_OP_SILU:
@@ -5689,7 +5696,7 @@ static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
         case GGML_UNARY_OP_TANH:
             break;
         default:
-            return;
+            return false;
         }
         break;
     case GGML_OP_REPEAT:
@@ -5726,7 +5733,7 @@ static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
     default:
         std::cerr << "ggml_vulkan: Error: Missing op: " << ggml_op_name(node->op) << std::endl;
         GGML_ABORT("fatal error");
-        return;
+        return false;
     }
 
     vk_context compute_ctx;
@@ -5826,7 +5833,7 @@ static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
             ggml_vk_unary(ctx, compute_ctx, src0, node, dryrun);
             break;
         default:
-            return;
+            return false;
         }
         break;
     case GGML_OP_DIAG_MASK_INF:
@@ -5870,11 +5877,11 @@ static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
 
         break;
     default:
-        return;
+        return false;
     }
 
     if (dryrun) {
-        return;
+        return false;
     }
 
     ctx->tensor_ctxs[node_idx] = compute_ctx;
@@ -5885,14 +5892,34 @@ static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
     last_node = true;
 #endif
 
-    if (last_node) {
+    if (submit || last_node) {
         ggml_vk_ctx_end(compute_ctx);
-        compute_ctx->exit_tensor_idx = node_idx;
+
+        // TODO probably it'd be better to pass an exit_node flag to ggml_vk_compute_forward
+        if (last_node) {
+            compute_ctx->exit_tensor_idx = node_idx_begin;
+        }
+        else {
+            compute_ctx->exit_tensor_idx = -1;
+        }
+
         ctx->compute_ctx.reset();
+
+        bool ok = ggml_vk_compute_forward(ctx, node_begin, node_idx_begin, false);
+        if (!ok) {
+            if (node->op == GGML_OP_UNARY) {
+                std::cerr << __func__ << ": error: op not supported UNARY " << node->name << " (" << ggml_unary_op_name(static_cast<ggml_unary_op>(node->op_params[0])) << ")" << std::endl;
+            }
+            else {
+                std::cerr << __func__ << ": error: op not supported " << node->name << " (" << ggml_op_name(node->op) << ")" << std::endl;
+            }
+        }
+
     }
+    return true;
 }
 
-static bool ggml_vk_compute_forward(ggml_backend_vk_context * ctx, ggml_tensor * tensor, int tensor_idx){
+static bool ggml_vk_compute_forward(ggml_backend_vk_context * ctx, ggml_tensor * tensor, int tensor_idx, bool use_fence = true){
     ggml_tensor_extra_gpu * extra = nullptr;
 
     switch (tensor->op) {
@@ -5960,40 +5987,38 @@ static bool ggml_vk_compute_forward(ggml_backend_vk_context * ctx, ggml_tensor *
 
     VK_LOG_DEBUG("ggml_vk_compute_forward(" << tensor << ", name=" << tensor->name << ", op=" << ggml_op_name(tensor->op) << ", type=" << tensor->type << ", ne0=" << tensor->ne[0] << ", ne1=" << tensor->ne[1] << ", ne2=" << tensor->ne[2] << ", ne3=" << tensor->ne[3] << ", nb0=" << tensor->nb[0] << ", nb1=" << tensor->nb[1] << ", nb2=" << tensor->nb[2] << ", nb3=" << tensor->nb[3] << ", view_src=" << tensor->view_src << ", view_offs=" << tensor->view_offs << ")");
 
-#ifdef GGML_VULKAN_CHECK_RESULTS
-    ggml_vk_check_results_0(tensor);
-#endif
-
     vk_context subctx = ctx->tensor_ctxs[tensor_idx].lock();
 
-#ifdef GGML_VULKAN_PERF
-    std::chrono::steady_clock::time_point start;
-#endif // GGML_VULKAN_PERF
+    // always wait for the GPU work to be done for the last submit
+    if (tensor_idx == subctx->exit_tensor_idx) {
+        use_fence = true;
+    }
 
     // Only run if ctx hasn't been submitted yet
     if (!subctx->seqs.empty()) {
+#ifdef GGML_VULKAN_CHECK_RESULTS
+        ggml_vk_check_results_0(tensor);
+        use_fence = true;
+#endif
+
         // Do staging buffer copies
         for (auto& cpy : subctx->in_memcpys) {
             memcpy(cpy.dst, cpy.src, cpy.n);
         }
 
-#ifdef GGML_VULKAN_PERF
-        start = std::chrono::steady_clock::now();
-#endif // GGML_VULKAN_PERF
+        ggml_vk_submit(subctx, use_fence ? ctx->fence : vk::Fence{});
+
+        if (use_fence) {
+            VK_CHECK(ctx->device->device.waitForFences({ ctx->fence }, true, UINT64_MAX), "ggml_vk_compute_forward waitForFences");
 
-        ggml_vk_submit(subctx, ctx->fence);
+            ctx->device->device.resetFences({ ctx->fence });
+        }
+#ifdef GGML_VULKAN_CHECK_RESULTS
+        ggml_vk_check_results_1(tensor);
+#endif
     }
 
     if (tensor_idx == subctx->exit_tensor_idx) {
-        VK_CHECK(ctx->device->device.waitForFences({ ctx->fence }, true, UINT64_MAX), "ggml_vk_compute_forward waitForFences");
-
-#ifdef GGML_VULKAN_PERF
-        auto duration = std::chrono::duration_cast<std::chrono::nanoseconds>(std::chrono::steady_clock::now() - start);
-        ctx->device->perf_logger->log_timing(tensor, duration.count());
-#endif // GGML_VULKAN_PERF
-
-        ctx->device->device.resetFences({ ctx->fence });
-
         // Do staging buffer copies
         for (auto& cpy : subctx->out_memcpys) {
             memcpy(cpy.dst, cpy.src, cpy.n);
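
Editor's note, not part of the diff: a minimal vulkan.hpp sketch of the fence handling the hunk above introduces. The helper name submit_batch and its parameters are hypothetical stand-ins, not the ggml API; the point is that intermediate batches are submitted with a null fence so CPU recording can continue, and only a batch whose results the CPU must observe passes the fence and blocks on it.

#include <vulkan/vulkan.hpp>
#include <vector>
#include <cstdint>

// Hypothetical helper: submit one batch of command buffers and optionally wait.
static void submit_batch(vk::Device device, vk::Queue queue,
                         const std::vector<vk::CommandBuffer>& cmd_bufs,
                         vk::Fence fence, bool wait) {
    vk::SubmitInfo submit_info;
    submit_info.commandBufferCount = static_cast<uint32_t>(cmd_bufs.size());
    submit_info.pCommandBuffers    = cmd_bufs.data();

    // Pass the fence only when the caller needs to block, mirroring
    // ggml_vk_submit(subctx, use_fence ? ctx->fence : vk::Fence{}) above.
    queue.submit({ submit_info }, wait ? fence : vk::Fence{});

    if (wait) {
        // Wait for the GPU, then reset the fence so the next blocking submit
        // can reuse it, as the use_fence branch above does.
        (void) device.waitForFences({ fence }, VK_TRUE, UINT64_MAX);
        device.resetFences({ fence });
    }
}
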
@@ -6482,7 +6507,7 @@ GGML_CALL static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backen
     ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context;
 
     for (int i = 0; i < cgraph->n_nodes; i++) {
-        ggml_vk_build_graph(ctx, cgraph->nodes[i], i, 0, true);
+        ggml_vk_build_graph(ctx, cgraph->nodes[i], i, nullptr, 0, true, false, false);
     }
     ggml_vk_preallocate_buffers(ctx);
     ggml_pipeline_allocate_descriptor_sets(ctx->device);
@@ -6497,31 +6522,36 @@ GGML_CALL static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backen
     // Reserve tensor context space for all nodes
     ctx->tensor_ctxs.resize(cgraph->n_nodes);
 
-    for (int i = 0; i < cgraph->n_nodes; i++) {
-        ggml_vk_build_graph(ctx, cgraph->nodes[i], i, i == last_node, false);
-    }
+    bool first_node_in_batch = true; // true if next node will be first node in a batch
+    int submit_node_idx = 0; // index to first node in a batch
 
+    // submit work every submit_count nodes to overlap CPU cmdbuffer generation with GPU execution
+    constexpr int submit_count = 100;
+    int submitted_nodes = 0;
     for (int i = 0; i < cgraph->n_nodes; i++) {
-        ggml_tensor * node = cgraph->nodes[i];
-
-        if (ggml_vk_is_empty(node)) {
-            continue;
+        if (first_node_in_batch) {
+            submit_node_idx = i;
         }
 
-        bool ok = ggml_vk_compute_forward(ctx, node, i);
-        if (!ok) {
-            if (node->op == GGML_OP_UNARY) {
-                std::cerr << __func__ << ": error: op not supported UNARY " << node->name << " (" << ggml_unary_op_name(static_cast<ggml_unary_op>(node->op_params[0])) << ")" << std::endl;
-            } else {
-                std::cerr << __func__ << ": error: op not supported " << node->name << " (" << ggml_op_name(node->op) << ")" << std::endl;
+        bool submit = (submitted_nodes >= submit_count) || (i == last_node);
+
+
+        bool enqueued = ggml_vk_build_graph(ctx, cgraph->nodes[i], i, cgraph->nodes[submit_node_idx], submit_node_idx, false, i == last_node, submit);
+
+        if (enqueued) {
+            ++submitted_nodes;
+
+#ifndef GGML_VULKAN_CHECK_RESULTS
+            if (first_node_in_batch) {
+                first_node_in_batch = false;
             }
+#endif
         }
-#ifdef GGML_VULKAN_CHECK_RESULTS
-        else {
-            ggml_vk_check_results_1(node);
+
+        if (submit) {
+            first_node_in_batch = true;
+            submitted_nodes = 0;
         }
-#endif
-        GGML_ASSERT(ok);
     }
 
 #ifdef GGML_VULKAN_PERF
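
Editor's note, not part of the diff: a self-contained sketch of the batching pattern the new graph_compute loop implements. The names record_node, submit_batch and N_NODES are hypothetical stand-ins for the real ggml/Vulkan calls; the point is the control flow, where a batch is flushed to the GPU every submit_count recorded nodes so command-buffer recording overlaps with execution of the previous batch, and only the final batch blocks on a fence.

#include <cstdio>

constexpr int N_NODES      = 350;  // pretend graph size
constexpr int submit_count = 100;  // flush a batch every 100 recorded nodes, as in the diff

// Pretend recorder: returns false for nodes that enqueue no GPU work (no-ops, views, ...).
static bool record_node(int i) { return i % 7 != 0; }

// Pretend submit: would hand the batch's command buffers to the GPU queue;
// only the last batch passes a fence and waits on it.
static void submit_batch(int first, int last, bool wait) {
    std::printf("submit nodes [%d, %d] wait=%d\n", first, last, wait ? 1 : 0);
}

int main() {
    int batch_start = 0;  // index of the first node in the current batch
    int recorded    = 0;  // nodes that actually enqueued work since the last submit

    for (int i = 0; i < N_NODES; i++) {
        const bool last = (i == N_NODES - 1);
        if (record_node(i)) {
            recorded++;
        }
        // Flush when the batch is full or the graph ends, mirroring
        // `bool submit = (submitted_nodes >= submit_count) || (i == last_node);`.
        if (recorded >= submit_count || last) {
            submit_batch(batch_start, i, /*wait=*/last);
            batch_start = i + 1;
            recorded    = 0;
        }
    }
    return 0;
}

A batch size on the order of 100 nodes keeps the number of queue submissions small while still letting the GPU start executing before the whole graph has been recorded.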